
Code to fetch annex-I submission lists and to download submissions

Johannes Gütschow 3 years ago
parent
commit 146033ed96

+ 0 - 20
Makefile

@@ -1,28 +1,8 @@
 .SILENT: help
 help:
 	echo Options:
-	echo make update-bur: Update list of BUR submissions
-	echo make download-bur: Download BUR submissions
-	echo make update-nc: Update list of NC submissions
-	echo make download-nc: Download NC submissions
-	echo make download-ndc: Download NDC submissions
 	echo make venv: create virtual environment
 
-update-bur: venv
-	datalad run -m "Fetch BUR submissions" -o downloaded_data/UNFCCC/submissions-bur.csv ./venv/bin/python code/UNFCCC_downloader/fetch_submissions_bur.py
-
-download-bur: venv
-	datalad run -m "Download BUR submissions" -i downloaded_data/UNFCCC/submissions-bur.csv ./venv/bin/python code/UNFCCC_downloader/download_bur.py
-
-update-nc: venv
-	datalad run -m "Fetch NC submissions" -o downloaded_data/UNFCCC/submissions-nc.csv ./venv/bin/python code/UNFCCC_downloader/fetch_submissions_nc.py
-
-download-nc: venv
-	datalad run -m "Download NC submissions" -i downloaded_data/UNFCCC/submissions-nc.csv ./venv/bin/python code/UNFCCC_downloader/download_nc.py
-
-download-ndc: venv
-	datalad run -m "Download NDC submissions" ./venv/bin/python code/UNFCCC_downloader/download_ndc.py
-
 venv: code/requirements.txt
 	[ -d ./venv ] || python3 -m venv venv
 	./venv/bin/pip install --upgrade pip

+ 181 - 0
code/UNFCCC_downloader/download_annexI.py

@@ -0,0 +1,181 @@
+import argparse
+import pandas as pd
+import requests
+import shutil
+import time
+import os
+import zipfile
+from datetime import date
+from selenium.webdriver import Firefox
+from selenium.webdriver.firefox.options import Options
+from random import randrange
+from pathlib import Path
+
+root = Path(__file__).parents[2]
+
+###############
+#
+# TODO
+# download directly via selenium see link below
+# https://sqa.stackexchange.com/questions/2197/
+# how-to-download-a-file-using-seleniums-webdriver
+# for automatic downloading see https://stackoverflow.com/questions/70740163/
+# python-selenium-firefox-driver-dismiss-open-save-file-popup
+###############
+
+descr = 'Download and unzip data from UNFCCC National Inventory Submissions. ' \
+        'Based on download.py from national-inventory-submissions ' \
+        '(https://github.com/openclimatedata/national-inventory-submisions)'
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument(
+    '--category',
+    help='Category to download: CRF, NIR, SEF'
+)
+parser.add_argument(
+    '--year',
+    help='Year to download'
+)
+
+args = parser.parse_args()
+year = args.year
+category = args.category.upper()
+dataset = category + year
+print(f"Downloading data for {dataset}")
+
+# generate the correct url
+url = (
+    "https://unfccc.int/process/transparency-and-reporting/"
+    "reporting-and-review-under-the-convention/"
+    "greenhouse-gas-inventories-annex-i-parties/"
+    "submissions/national-inventory-submissions-{}".format(year)
+)
+
+# TODO: years before 2019
+if int(year) == 2019:
+    url = (
+        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
+        "reporting-and-review-under-the-convention/"
+        "greenhouse-gas-inventories-annex-i-parties/"
+        "national-inventory-submissions-{}".format(year)
+    )
+
+if int(year) >= 2020:
+    url = (
+        "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
+    )
+
+download_path = root / "downloaded_data" / "UNFCCC"
+
+# if we get files of this size they are error pages and we need to
+# try the download again
+error_file_sizes = [212, 210]
+
+# Read submissions list
+submissions = pd.read_csv(download_path / f"submissions-annexI_{year}.csv")
+
+# filter submissions list for the requested category
+items = submissions[submissions.Kind == category]
+
+# set options for headless mode
+profile_path = ".firefox"
+options = Options()
+#options.add_argument('-headless')
+
+# create profile for headless mode and automatic downloading
+options.set_preference('profile', profile_path)
+options.set_preference('browser.download.folderList', 2)
+
+# set up selenium driver
+driver = Firefox(options=options)
+# visit the main data page once to create cookies
+driver.get(url)
+
+# wait a bit for the website to load before we get the cookies
+time.sleep(20)
+
+# get the session id cookie
+cookies_selenium = driver.get_cookies()
+cookies = {}
+for cookie in cookies_selenium:
+    cookies[cookie['name']] = cookie['value']
+
+new_downloaded = []
+
+for idx, submission in items.iterrows():
+    print("=" * 60)
+    title = submission.Title
+    url = submission.URL
+    country = submission.Country
+    country = country.replace(' ', '_')
+    print(f"Downloading {title} from {url}")
+
+    country_folder = download_path / country
+    if not country_folder.exists():
+        country_folder.mkdir()
+    local_filename = \
+        country_folder / dataset / \
+        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
+    if not local_filename.parent.exists():
+        local_filename.parent.mkdir()
+
+    if local_filename.exists():
+        # check file size. if 210 or 212 bytes it's the error page
+        if Path(local_filename).stat().st_size in error_file_sizes:
+            # found the error page. delete file
+            os.remove(local_filename)
+    
+    # now we have removed error pages, so a present file should not be overwritten
+    if not local_filename.exists():
+        i = 0  # reset counter
+        while not local_filename.exists() and i < 10:
+            # for i = 1 and i = 5 try to get a new session ID
+            if i == 1 or i == 5:
+                driver = Firefox(options=options)
+    
+                # visit the main data page once to create cookies
+                driver.get(url)
+                time.sleep(20)
+
+                # get the session id cookie
+                cookies_selenium = driver.get_cookies()
+                cookies = {}
+                for cookie in cookies_selenium:
+                    cookies[cookie['name']] = cookie['value']
+
+            r = requests.get(url, stream=True, cookies=cookies)
+            with open(str(local_filename), 'wb') as f:
+                shutil.copyfileobj(r.raw, f)
+            
+            # check file size. if 210 or 212 bytes it's the error page
+            if Path(local_filename).stat().st_size in error_file_sizes:
+                # found the error page. delete file
+                os.remove(local_filename)
+            
+            # sleep a bit to avoid running into captchas
+            time.sleep(randrange(5, 15))
+            # count the attempt so the retry limit of 10 can take effect
+            i += 1
+
+        if local_filename.exists():
+            new_downloaded.append(submission)
+            print(f"Download => {local_filename.relative_to(root)}")
+            # unzip data (only for new downloads)
+            if local_filename.suffix == ".zip":
+                try:
+                    zipped_file = zipfile.ZipFile(str(local_filename), 'r')
+                    zipped_file.extractall(str(local_filename.parent))
+                    print(f"Extracted {len(zipped_file.namelist())} files.")
+                    zipped_file.close()
+                # TODO Better error logging/visibility
+                except zipfile.BadZipFile:
+                    print(f"Error while trying to extract {local_filename.relative_to(root)}")
+                except NotImplementedError:
+                    print("Zip format not supported, please unzip on the command line.")
+            else:
+                print(f"Not attempting to extract {local_filename.relative_to(root)}.")
+        else:
+            print(f"Failed to download {local_filename.relative_to(root)}")
+
+    else:
+        print(f"=> Already downloaded {local_filename.relative_to(root)}")
+
+driver.close()
+
+df = pd.DataFrame(new_downloaded)
+df.to_csv(download_path / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)
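
Note: the auto-download TODO at the top of this file is carried over unresolved. For reference, a minimal sketch of the Firefox preferences that let Selenium save files directly, skipping the open/save dialog; the MIME type list is an assumption and may need to match what the UNFCCC server actually sends:

from pathlib import Path
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

download_dir = Path("downloaded_data/UNFCCC").absolute()

options = Options()
options.add_argument('-headless')
# 2 = download into the custom directory given in browser.download.dir
options.set_preference('browser.download.folderList', 2)
options.set_preference('browser.download.dir', str(download_dir))
# content types to save without asking (assumed list, adjust as needed)
options.set_preference('browser.helperApps.neverAsk.saveToDisk',
                       'application/zip,application/pdf,application/octet-stream')
# save PDFs instead of opening them in the built-in viewer
options.set_preference('pdfjs.disabled', True)

driver = Firefox(options=options)
# navigating to a file URL now saves it into download_dir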

+ 0 - 146
code/UNFCCC_downloader/download_bur.py

@@ -1,146 +0,0 @@
-import pandas as pd
-import requests
-import shutil
-import time
-import os
-from datetime import date
-from selenium.webdriver import Firefox
-from selenium.webdriver.firefox.options import Options
-from random import randrange
-
-from pathlib import Path
-root = Path(__file__).parents[2]
-"""
-based on download_bur from national-inventory-submissions
-# (https://github.com/openclimatedata/national-inventory-submisions)
-"""
-
-###############
-#
-# TODO
-# download directly via selenium see link below
-# https://sqa.stackexchange.com/questions/2197/
-# how-to-download-a-file-using-seleniums-webdriver
-###############
-
-submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" /
-                          "submissions-bur.csv")
-
-url = "https://unfccc.int/BURs"
-
-# if we get files of this size they are error pages and we need to
-# try the download again
-error_file_sizes = [212, 210]
-
-# find which BUR submission rounds exist
-present_BURs = submissions.Kind.unique()
-
-# Ensure download path and subfolders exist
-download_path = root / "downloaded_data/UNFCCC"
-if not download_path.exists():
-    download_path.mkdir(parents=True)
-
-# set options for headless mode
-profile_path = ".firefox"
-options = Options()
-#options.add_argument('-headless')
-
-# create profile for headless mode and automatic downloading
-options.set_preference('profile', profile_path)
-options.set_preference('browser.download.folderList', 2)
-
-# set up selenium driver
-driver = Firefox(options=options)
-# visit the main data page once to create cookies
-driver.get(url)
-time.sleep(20)
-
-# get the session id cookie
-cookies_selenium = driver.get_cookies()
-cookies = {}
-for cookie in cookies_selenium:
-    cookies[cookie['name']] = cookie['value']
-
-print(cookies)
-
-new_downloaded = []
-
-for idx, submission in submissions.iterrows():
-    print("=" * 60)
-    bur = submission.Kind
-    title = submission.Title
-    url = submission.URL
-    country = submission.Country
-    country = country.replace(' ', '_')
-    print(title)
-
-    country_folder = download_path / country
-    if not country_folder.exists():
-        country_folder.mkdir()
-    local_filename = country_folder / bur / url.split('/')[-1]
-    local_filename_underscore = \
-        download_path / country / bur / \
-        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
-    if not local_filename.parent.exists():
-        local_filename.parent.mkdir()
-
-    ### remove, not needed as no legacy data present
-    #if local_filename.exists():
-    #    # rename
-    #    local_filename.rename(local_filename_underscore)
-    #    print("Renamed " + bur + "/" + country + "/" + local_filename.name)
-
-    # this should never be needed but in case anything goes wrong and
-    # an error page is present it should be overwritten
-    if local_filename_underscore.exists():
-        # check file size. if 210 or 212 bytes it's the error page
-        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
-            # found the error page. delete file
-            os.remove(local_filename_underscore)
-    
-    # now we have remove error pages, so a present file should not be overwritten
-    if not local_filename_underscore.exists():
-        i = 0  # reset counter
-        while not local_filename_underscore.exists() and i < 10:
-            # for i = 0 and i = 5 try to get a new session ID
-            if i == 1 or i == 5:
-                driver = webdriver.Firefox(options=options,
-                                           firefox_profile=profile)
-    
-                # visit the main data page once to create cookies
-                driver.get(url)
-                time.sleep(20)
-                
-                # get the session id cookie
-                cookies_selenium = driver.get_cookies()
-                cookies = {}
-                for cookie in cookies_selenium:
-                    cookies[cookie['name']] = cookie['value']
-                    
-            r = requests.get(url, stream=True, cookies=cookies)
-            with open(str(local_filename_underscore), 'wb') as f:
-                shutil.copyfileobj(r.raw, f)
-            
-            # check file size. if 210 or 212 bytes it's the error page
-            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
-                # found the error page. delete file
-                os.remove(local_filename_underscore)
-            
-            # sleep a bit to avoid running into captchas
-            time.sleep(randrange(5, 15))
-            
-        if local_filename_underscore.exists():
-            new_downloaded.append(submission)
-            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
-                  "/" + local_filename_underscore.name)
-        else:
-            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
-                  + bur + "/" + local_filename_underscore.name)
-
-    else:
-        print("=> Already downloaded " + local_filename_underscore.name)
-
-driver.close()
-
-df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / "00_new_downloads_bur-{}.csv".format(date.today()), index=False)

+ 42 - 46
code/UNFCCC_downloader/download_nc.py → code/UNFCCC_downloader/download_non-annexI.py

@@ -1,3 +1,4 @@
+import argparse
 import pandas as pd
 import requests
 import shutil
@@ -7,13 +8,9 @@ from datetime import date
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
-
 from pathlib import Path
+
 root = Path(__file__).parents[2]
-"""
-based on download_bur from national-inventory-submissions
-# (https://github.com/openclimatedata/national-inventory-submisions)
-"""
 
 ###############
 #
@@ -25,19 +22,31 @@ based on download_bur from national-inventory-submissions
 # python-selenium-firefox-driver-dismiss-open-save-file-popup
 ###############
 
-submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" /
-                          "submissions-nc.csv")
+descr = 'Download data from UNFCCC non-AnnexI Submissions. ' \
+        'Based on download_bur.py from national-inventory-submissions ' \
+        '(https://github.com/openclimatedata/national-inventory-submisions)'
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument(
+    '--category',
+    help='Category to download: BUR, NC'
+)
 
-url = "https://unfccc.int/non-annex-I-NCs"
+args = parser.parse_args()
+category = args.category.upper()
+print(f"Downloading {category} submissions")
+
+if category == "BUR":
+    url = "https://unfccc.int/BURs"
+else:
+    url = "https://unfccc.int/non-annex-I-NCs"
 
 # if we get files of this size they are error pages and we need to
 # try the download again
 error_file_sizes = [212, 210]
 
-# Ensure download path and subfolders exist
+# Read submissions list
 download_path = root / "downloaded_data" / "UNFCCC"
-if not download_path.exists():
-    download_path.mkdir(parents=True)
+submissions = pd.read_csv(download_path / f"submissions-{category.lower()}.csv")
 
 # set options for headless mode
 profile_path = ".firefox"
@@ -53,6 +62,7 @@ driver = Firefox(options=options)
 # visit the main data page once to create cookies
 driver.get(url)
 
+# wait a bit for the website to load before we get the cookies
 time.sleep(20)
 
 # get the session id cookie
@@ -61,86 +71,72 @@ cookies = {}
 for cookie in cookies_selenium:
     cookies[cookie['name']] = cookie['value']
 
-print(cookies)
-
 new_downloaded = []
 
 for idx, submission in submissions.iterrows():
     print("=" * 60)
-    bur = submission.Kind
+    kind = submission.Kind
     title = submission.Title
     url = submission.URL
     country = submission.Country
     country = country.replace(' ', '_')
-    print(title)
+    print(f"Downloading {title} from {url}")
 
     country_folder = download_path / country
     if not country_folder.exists():
         country_folder.mkdir()
-    local_filename = country_folder / bur / url.split('/')[-1]
-    local_filename_underscore = \
-        download_path / country / bur / \
+    local_filename = \
+        country_folder / kind / \
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
     if not local_filename.parent.exists():
         local_filename.parent.mkdir()
 
-    ### remove, not needed as no legacy data present
-    #if local_filename.exists():
-    #    # rename
-    #    local_filename.rename(local_filename_underscore)
-    #    print("Renamed " + bur + "/" + country + "/" + local_filename.name)
-
-    # this should never be needed but in case anything goes wrong and
-    # an error page is present it should be overwritten
-    if local_filename_underscore.exists():
+    if local_filename.exists():
         # check file size. if 210 or 212 bytes it's the error page
-        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+        if Path(local_filename).stat().st_size in error_file_sizes:
             # found the error page. delete file
-            os.remove(local_filename_underscore)
+            os.remove(local_filename)
     
-    # now we have remove error pages, so a present file should not be overwritten
-    if not local_filename_underscore.exists():
+    # now we have removed error pages, so a present file should not be overwritten
+    if not local_filename.exists():
         i = 0  # reset counter
-        while not local_filename_underscore.exists() and i < 10:
+        while not local_filename.exists() and i < 10:
             # for i = 1 and i = 5 try to get a new session ID
             if i == 1 or i == 5:
-                driver = webdriver.Firefox(options=options,
-                                           firefox_profile=profile)
+                driver = Firefox(options=options)
     
                 # visit the main data page once to create cookies
                 driver.get(url)
                 time.sleep(20)
-                
+
                 # get the session id cookie
                 cookies_selenium = driver.get_cookies()
                 cookies = {}
                 for cookie in cookies_selenium:
                     cookies[cookie['name']] = cookie['value']
-                    
+
             r = requests.get(url, stream=True, cookies=cookies)
-            with open(str(local_filename_underscore), 'wb') as f:
+            with open(str(local_filename), 'wb') as f:
                 shutil.copyfileobj(r.raw, f)
             
             # check file size. if 210 or 212 bytes it's the error page
-            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+            if Path(local_filename).stat().st_size in error_file_sizes:
                 # found the error page. delete file
-                os.remove(local_filename_underscore)
+                os.remove(local_filename)
             
             # sleep a bit to avoid running into captchas
             time.sleep(randrange(5, 15))
+            # count the attempt so the retry limit of 10 can take effect
+            i += 1
             
-        if local_filename_underscore.exists():
+        if local_filename.exists():
             new_downloaded.append(submission)
-            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
-                  "/" + local_filename_underscore.name)
+            print(f"Download => {local_filename.relative_to(root)}")
         else:
-            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
-                  + bur + "/" + local_filename_underscore.name)
+            print(f"Failed to download {local_filename.relative_to(root)}")
 
     else:
-        print("=> Already downloaded " + local_filename_underscore.name)
+        print(f"=> Already downloaded {local_filename.relative_to(root)}")
 
 driver.close()
 
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / "00_new_downloads_nc-{}.csv".format(date.today()), index=False)
+df.to_csv(download_path / f"00_new_downloads_{category}-{date.today()}.csv", index=False)
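
Both download scripts rely on the same hand-off: Selenium loads the submissions page once to obtain valid session cookies, and requests then reuses those cookies for the streamed file transfers. A sketch of that pattern as a reusable helper (hypothetical function, not part of this commit):

import shutil
import requests
from selenium.webdriver import Firefox


def session_from_driver(driver: Firefox) -> requests.Session:
    # copy the browser's session cookies into a requests session
    session = requests.Session()
    for cookie in driver.get_cookies():
        session.cookies.set(cookie['name'], cookie['value'])
    return session

# usage sketch:
# session = session_from_driver(driver)
# r = session.get(file_url, stream=True)
# with open(local_filename, 'wb') as f:
#     shutil.copyfileobj(r.raw, f)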

+ 140 - 0
code/UNFCCC_downloader/fetch_submissions_annexI.py

@@ -0,0 +1,140 @@
+import argparse
+import time
+import pandas as pd
+
+from pathlib import Path
+from bs4 import BeautifulSoup
+from selenium.webdriver import Firefox
+from selenium.webdriver.firefox.options import Options
+from random import randrange
+from unfccc_submission_info import get_unfccc_submission_info
+
+root = Path(__file__).absolute().parents[2]
+
+max_tries = 10
+
+descr = ("Download UNFCCC National Inventory Submissions lists "
+         "and create list of submissions as CSV file. Based on "
+         "process.py from national-inventory-submissions "
+         "(https://github.com/openclimatedata/national-inventory-submisions)")
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument(
+    '--year',
+    help='Year to download'
+)
+
+args = parser.parse_args()
+year = args.year
+
+print("Fetching submissions for {}".format(year))
+
+url = (
+    "https://unfccc.int/process/transparency-and-reporting/"
+    "reporting-and-review-under-the-convention/"
+    "greenhouse-gas-inventories-annex-i-parties/"
+    "submissions/national-inventory-submissions-{}".format(year)
+)
+
+if int(year) == 2019:
+    url = (
+        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
+        "reporting-and-review-under-the-convention/"
+        "greenhouse-gas-inventories-annex-i-parties/"
+        "national-inventory-submissions-{}".format(year)
+    )
+
+if int(year) >= 2020:
+    url = (
+        "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
+    )
+
+print(f"Using {url} to get submissions list")
+
+# set options for headless mode
+profile_path = ".firefox"
+options = Options()
+options.add_argument('-headless')
+
+# create profile for headless mode and automatic downloading
+options.set_preference('profile', profile_path)
+
+# set up selenium driver
+driver = Firefox(options=options)
+driver.get(url)
+
+html = BeautifulSoup(driver.page_source, "html.parser")
+
+table = html.find("table")
+
+# check if the table was found. If not, the get command didn't work,
+# likely because of a captcha on the site
+### TODO replace by error message
+if not table:
+    # try to load html file from disk
+    print('Download failed, trying to load manually downloaded file')
+    fallback_file = "manual_page_downloads/National-Inventory-Submissions-{}.html".format(year)
+    try:
+        with open(fallback_file) as file:
+            content = file.read()
+    except FileNotFoundError:
+        print("Manually downloaded file " + fallback_file + " not found")
+        exit()
+    html = BeautifulSoup(content, "html.parser")
+    table = html.find("table")
+    if not table:
+        print("No submissions table found in " + fallback_file)
+        exit()
+
+links = table.findAll('a')
+
+targets = []  # sub-pages
+downloads = []
+no_downloads = []
+
+# Check links for Zipfiles or subpages
+for link in links:
+    if "href" not in link.attrs:
+        continue
+    href = link.attrs["href"]
+    if "/documents/" in href:
+        if "title" in link.attrs.keys():
+            title = link.attrs["title"]
+        else:
+            title = link.contents[0]
+        if href.startswith("/documents"):
+            href = "https://unfccc.int" + href
+        # Only add pages in the format https://unfccc.int/documents/65587
+        # to further downloads
+        if str(Path(href).parent).endswith("documents"):
+            targets.append({"title": title, "url": href})
+    elif href.endswith(".zip"):
+        if href.startswith("/files"):
+            href = "https://unfccc.int" + href
+        country = Path(href).name.split("-")[0].upper()
+        title = f"{country} {link.contents[0]}"
+        filename = Path(href).name
+        file_parts = filename.split('-')
+        # the kind is the third dash-separated part of the filename,
+        # so at least three parts are needed to index file_parts[2]
+        if len(file_parts) >= 3:
+            kind = file_parts[2].upper()
+        elif filename.startswith('asr'):
+            kind = 'CRF'
+        else:
+            kind = None
+
+        print("\t".join([kind, country, title, href]))
+        downloads.append({"Kind": kind, "Country": country, "Title": title, "URL": href})
+
+# Go through sub-pages.
+for target in targets:
+    time.sleep(randrange(5, 15))
+    url = target["url"]
+
+    submission_info = get_unfccc_submission_info(url, driver, max_tries)
+
+    if submission_info:
+        downloads = downloads + submission_info
+    else:
+        no_downloads.append((target["title"], url))
+
+if len(no_downloads) > 0:
+    print("No downloads for ", no_downloads)
+
+driver.close()
+df = pd.DataFrame(downloads)
+df.to_csv(root / "downloaded_data" / "UNFCCC" / f"submissions-annexI_{year}.csv", index=False)
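
The year-dependent URL construction above also appears verbatim in download_annexI.py; a sketch of a shared helper both scripts could import (hypothetical name, not part of this commit):

def annexI_submissions_url(year: int) -> str:
    # page listing all AnnexI inventory submissions for one year
    if year >= 2020:
        return f"https://unfccc.int/ghg-inventories-annex-i-parties/{year}"
    if year == 2019:
        return ("https://unfccc.int/process-and-meetings/transparency-and-reporting/"
                "reporting-and-review-under-the-convention/"
                "greenhouse-gas-inventories-annex-i-parties/"
                f"national-inventory-submissions-{year}")
    # layout used before 2019; per the TODO in both scripts this is
    # not yet verified for all earlier years
    return ("https://unfccc.int/process/transparency-and-reporting/"
            "reporting-and-review-under-the-convention/"
            "greenhouse-gas-inventories-annex-i-parties/"
            f"submissions/national-inventory-submissions-{year}")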

+ 9 - 51
code/UNFCCC_downloader/fetch_submissions_bur.py

@@ -8,8 +8,9 @@ from bs4 import BeautifulSoup
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
+from unfccc_submission_info import get_unfccc_submission_info
 
-root = Path(__file__).parents[2]
+root = Path(__file__).absolute().parents[2]
 
 """
 Download UNFCCC Biennial Update Report submissions
@@ -68,57 +69,14 @@ pattern = re.compile(r"BUR ?\d")
 for target in targets:
     time.sleep(randrange(5, 15))
     url = target["url"]
-    #subpage = requests.get(url, timeout=15.5)
-    driver.get(url)
-    html = BeautifulSoup(driver.page_source, "html.parser")
-    title = html.find("h1").contents[0]
-    match = pattern.search(title)
-    if match:
-        kind = match.group(0).replace(" ", "")
-    else:
-        kind = None
-
-
-    h2 = html.find("h2", text="Versions")
-    if h2:
-        div = h2.findNext("div")
-        links = div.findAll("a")
-        try:
-            country = (
-                html.find("h2", text="Countries").findNext("div").findNext("div").text
-            )
-        except AttributeError:
-            country = (
-                html.find("h2", text="Corporate Author")
-                .findNext("div")
-                .findNext("div")
-                .text
-            )
-        doctype = (
-            html.find("h2", text="Document Type").findNext("div").findNext("div").text
-        )
-        for link in links:
-            url = link.attrs["href"]
-            if not kind:
-                match = pattern.search(url.upper())
-                if match:
-                    kind = match.group(0)
-                else:
-                    if ("NIR" in doctype) or ("NIR" in title):
-                        kind = "NIR"
-                    elif "NC" in title:
-                        kind = "NC"
-            downloads.append(
-                {
-                    "Kind": kind,
-                    "Country": country,
-                    "Title": title,
-                    "URL": url,
-                }
-            )
-        print("\t".join([kind, country, title, url]))
+
+    submission_info = get_unfccc_submission_info(url, driver, 10)
+
+    if submission_info:
+        downloads = downloads + submission_info
     else:
-        no_downloads.append((title, url))
+        no_downloads.append((target["title"], url))
+
 
 if len(no_downloads) > 0:
     print("No downloads for ", no_downloads)

+ 1 - 1
code/UNFCCC_downloader/fetch_submissions_nc.py

@@ -9,7 +9,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 
-root = Path(__file__).parents[2]
+root = Path(__file__).absolute().parents[2]
 
 """
 Download UNFCCC National Communication submissions

+ 89 - 0
code/UNFCCC_downloader/unfccc_submission_info.py

@@ -0,0 +1,89 @@
+# helper functions to gather submission info from UNFCCC website
+import time
+import re
+from random import randrange
+from typing import Dict, List
+from selenium.webdriver import Firefox
+from bs4 import BeautifulSoup
+
+
+def get_unfccc_submission_info(
+        url: str,
+        driver: Firefox,
+        max_tries: int = 10,
+) -> List[Dict[str, str]]:
+    info = []
+    pattern = re.compile(r"BUR ?\d")
+    i = 0
+    while i < max_tries:
+        try:
+            driver.get(url)
+            html = BeautifulSoup(driver.page_source, "html.parser")
+            title = html.find("h1").contents[0]
+            break
+        except AttributeError:
+            print(f"Error fetching {url}")
+            print("Retrying ...")
+            time.sleep(randrange(5, 15))
+            i += 1
+            continue
+
+    if i == max_tries:
+        print(f"Aborting after {max_tries} tries")
+    else:
+        match = pattern.search(title)
+        if match:
+            kind = match.group(0).replace(" ", "")
+        else:
+            kind = None
+
+        h2 = html.find("h2", text="Versions")
+        if h2:
+            div = h2.findNext("div")
+            links = div.findAll("a")
+            try:
+                country = (
+                    html.find("h2", text="Countries").findNext("div").findNext("div").text
+                )
+            except AttributeError:
+                country = (
+                    html.find("h2", text="Corporate Author")
+                    .findNext("div")
+                    .findNext("div")
+                    .text
+                )
+            doctype = (
+                html.find("h2", text="Document Type").findNext("div").findNext("div").text
+            )
+            for link in links:
+                url = link.attrs["href"]
+                if not kind:
+                    match = pattern.search(url.upper())
+                    if match:
+                        kind = match.group(0)
+                    else:
+                        if ("CRF" in doctype) or ("CRF" in title):
+                            kind = "CRF"
+                        elif ("SEF" in doctype) or ("SEF" in title):
+                            kind = "SEF"
+                        elif ("NIR" in doctype) or ("NIR" in title):
+                            kind = "NIR"
+                        elif "NC" in title:
+                            kind = "NC"
+                        elif "Status report" in title:
+                            kind = "CRF"
+                        else:
+                            kind = "other"
+                info.append({
+                    "Kind": kind,
+                    "Country": country,
+                    "Title": title,
+                    "URL": url,
+                })
+
+            print("\t".join([kind, country, title, url]))
+        else:
+            print(f"No files found for {url}")
+
+    return info
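
A short usage sketch of the new helper, as the fetch scripts call it; the document URL is the placeholder format cited in fetch_submissions_annexI.py:

from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from unfccc_submission_info import get_unfccc_submission_info

options = Options()
options.add_argument('-headless')
driver = Firefox(options=options)

info = get_unfccc_submission_info("https://unfccc.int/documents/65587",
                                  driver, max_tries=10)
for entry in info:
    print(entry["Kind"], entry["Country"], entry["Title"], entry["URL"])
driver.close()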

+ 40 - 2
dodo.py

@@ -54,7 +54,7 @@ def task_download_bur():
         # before download
         'actions': ['datalad run -m "Download BUR submissions" '
                     '-i downloaded_data/UNFCCC/submissions-bur.csv '
-                    './venv/bin/python code/UNFCCC_downloader/download_bur.py'],
+                    './venv/bin/python code/UNFCCC_downloader/download_non-annexI.py --category=BUR'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -72,6 +72,8 @@ def task_update_nc():
     }
 
 
+
+
 def task_download_nc():
     """ Download NC submissions """
     return {
@@ -80,7 +82,42 @@ def task_download_nc():
         # before download
         'actions': ['datalad run -m "Download NC submissions" '
                     '-i downloaded_data/UNFCCC/submissions-nc.csv '
-                    './venv/bin/python code/UNFCCC_downloader/download_nc.py'],
+                    './venv/bin/python code/UNFCCC_downloader/download_non-annexI.py --category=NC'],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
+# annexI data: one update call for all data types (as they are on one page)
+# but for each year separately.
+# downloading is per year and data type (category)
+update_aI_config = {
+    "year": get_var('year', None),
+    "category": get_var('category', None),
+}
+
+
+def task_update_annexi():
+    """ Update list of annexI submissions """
+    return {
+        'targets': [f"downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv"],
+        'actions': [f"datalad run -m 'Fetch AnnexI submissions for {update_aI_config['year']}'"
+                    f"-o downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
+                    f"./venv/bin/python code/UNFCCC_downloader/fetch_submissions_annexI.py "
+                    f"--year={update_aI_config['year']}"],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
+
+def task_download_annexi():
+    """ Download AnnexI submissions """
+    return {
+        #'file_dep': [f"downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv"],
+        # deactivate file_dep for now as it will always run fetch submissions
+        # before download
+        'actions': [f"datalad run -m 'Download AnnexI submissions for "
+                    f"{update_aI_config['category']}{update_aI_config['year']}' "
+                    f"-i downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
+                    f"./venv/bin/python code/UNFCCC_downloader/download_annexI.py "
+                    f"--category={update_aI_config['category']} --year={update_aI_config['year']}"],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -103,6 +140,7 @@ read_config = {
     "submission": get_var('submission', None),
 }
 
+
 def task_read_unfccc_submission():
     """ Read submission for a country (if code exists) """
     return {
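
Note that the new annexI tasks read year and category through doit's get_var, so they are invoked with key=value pairs on the command line, e.g. doit update_annexi year=2022 or doit download_annexi category=CRF year=2022.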