Browse Source

fix issue with unfccc scraper and update BTR list

Johannes Gütschow 5 months ago
parent
commit
d74ea807c4

+ 3 - 0
.gitignore

@@ -13,6 +13,9 @@ geckodriver.log
 
 # Databases
 *.db
+*.db.bak
+*.db.dat
+*.db.dir
 
 # Jupyter cache
 .jupyter_cache

+ 1 - 1
downloaded_data/UNFCCC/submissions-BTR1.csv

@@ -1 +1 @@
-../../.git/annex/objects/mF/81/MD5E-s4170--ebf216df90a32fc9af06099f928504fd.csv/MD5E-s4170--ebf216df90a32fc9af06099f928504fd.csv
+../../.git/annex/objects/2q/2F/MD5E-s5262--359e490dfbfd554643e66b0913d6368c.csv/MD5E-s5262--359e490dfbfd554643e66b0913d6368c.csv

File diff suppressed because it is too large
+ 433 - 370
poetry.lock


+ 2 - 2
src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_btr.py

@@ -40,7 +40,7 @@ if __name__ == "__main__":
     # set options for headless mode
     profile_path = ".firefox"
     options = Options()
-    options.add_argument("-headless")
+    # options.add_argument("-headless")
 
     # create profile for headless mode and automatic downloading
     options.set_preference("profile", profile_path)
@@ -72,7 +72,7 @@ if __name__ == "__main__":
         if "href" not in link.attrs:
             continue
         href = link.attrs["href"]
-        if "/documents/" in href:
+        if "documents/" in href:
             if "title" in link.attrs.keys():
                 title = link.attrs["title"]
             else:

+ 11 - 2
src/unfccc_ghg_data/unfccc_downloader/unfccc_submission_info.py

@@ -43,6 +43,7 @@ def get_unfccc_submission_info(  # noqa: PLR0912, PLR0915
     info = []
     pattern = re.compile(r"BUR ?\d")
     pattern_NC = re.compile(r"NC ?\d")
+    pattern_BTR = re.compile(r"BTR ?\d")
     i = 0
     last_excep = None
     while i < max_tries:
@@ -72,7 +73,11 @@ def get_unfccc_submission_info(  # noqa: PLR0912, PLR0915
             if match:
                 kind = match.group(0).replace(" ", "")
             else:
-                kind = None
+                match = pattern_BTR.search(title)
+                if match:
+                    kind = match.group(0).replace(" ", "")
+                else:
+                    kind = None
 
         # TODO: might improve speed by first searching for class="document-line"
         #  and then operating on the resulting subtree for the info
@@ -97,7 +102,11 @@ def get_unfccc_submission_info(  # noqa: PLR0912, PLR0915
 
         # get files
         sub_files = html.find(
-            class_=["form-select form-control", "form-select form-control download"]
+            class_=[
+                "form-select form-control",
+                "form-select form-control download",
+                "small-download form-select form-control download",
+            ]
         )
         if sub_files:
             files = sub_files.find_all("option", value=True)

Some files were not shown because too many files changed in this diff