@@ -1,3 +1,4 @@
+import argparse
 import pandas as pd
 import requests
 import shutil
@@ -7,13 +8,9 @@ from datetime import date
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
-
 from pathlib import Path
+
 root = Path(__file__).parents[2]
-"""
-based on download_bur from national-inventory-submissions
-# (https://github.com/openclimatedata/national-inventory-submisions)
-"""

 ###############
 #
@@ -25,19 +22,31 @@ based on download_bur from national-inventory-submissions
 # python-selenium-firefox-driver-dismiss-open-save-file-popup
 ###############

-submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" /
-                          "submissions-nc.csv")
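+# command line interface to select the category of submissions to download,
+# e.g. "--category BUR" or "--category NC"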
+descr = 'Download data from UNFCCC non-AnnexI Submissions. ' \
+        'Based on download_bur.py from national-inventory-submissions ' \
+        '(https://github.com/openclimatedata/national-inventory-submisions)'
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument(
+    '--category',
+    help='Category to download: BUR, NC'
+)

-url = "https://unfccc.int/non-annex-I-NCs"
|
|
|
|
|
|
+args = parser.parse_args()
|
|
|
|
+category = args.category.upper()
|
|
|
|
+print(f"Downloading {category} submissions")
|
|
|
|
+
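+# each category has its own overview page on the UNFCCC website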
+if category == "BUR":
+    url = "https://unfccc.int/BURs"
+else:
+    url = "https://unfccc.int/non-annex-I-NCs"

 # if we get files of this size they are error pages and we need to
 # try the download again
 error_file_sizes = [212, 210]

-# Ensure download path and subfolders exist
+# Read submissions list
 download_path = root / "downloaded_data" / "UNFCCC"
-if not download_path.exists():
-    download_path.mkdir(parents=True)
+submissions = pd.read_csv(download_path / f"submissions-{category.lower()}.csv")

 # set options for headless mode
 profile_path = ".firefox"
@@ -53,6 +62,7 @@ driver = Firefox(options=options)
 # visit the main data page once to create cookies
 driver.get(url)

+# wait a bit for the website to load before we get the cookies
 time.sleep(20)

 # get the session id cookie
@@ -61,86 +71,72 @@ cookies = {}
 for cookie in cookies_selenium:
     cookies[cookie['name']] = cookie['value']

-print(cookies)
-
 new_downloaded = []

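+# loop over the submissions list and download every file that is not yet present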
 for idx, submission in submissions.iterrows():
     print("=" * 60)
-    bur = submission.Kind
+    kind = submission.Kind
     title = submission.Title
     url = submission.URL
     country = submission.Country
     country = country.replace(' ', '_')
-    print(title)
+    print(f"Downloading {title} from {url}")

     country_folder = download_path / country
     if not country_folder.exists():
         country_folder.mkdir()
-    local_filename = country_folder / bur / url.split('/')[-1]
-    local_filename_underscore = \
-        download_path / country / bur / \
+    local_filename = \
+        country_folder / kind / \
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
     if not local_filename.parent.exists():
         local_filename.parent.mkdir()

-    ### remove, not needed as no legacy data present
-    #if local_filename.exists():
-    #    # rename
-    #    local_filename.rename(local_filename_underscore)
-    #    print("Renamed " + bur + "/" + country + "/" + local_filename.name)
-
-    # this should never be needed but in case anything goes wrong and
-    # an error page is present it should be overwritten
-    if local_filename_underscore.exists():
+    if local_filename.exists():
         # check file size. if 210 or 212 bytes it's the error page
-        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+        if Path(local_filename).stat().st_size in error_file_sizes:
             # found the error page. delete file
-            os.remove(local_filename_underscore)
+            os.remove(local_filename)

-    # now we have remove error pages, so a present file should not be overwritten
-    if not local_filename_underscore.exists():
+    # now we have removed error pages, so a present file should not be overwritten
+    if not local_filename.exists():
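+        # try the download up to 10 times; get a fresh session on attempts 1 and 5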
         i = 0 # reset counter
-        while not local_filename_underscore.exists() and i < 10:
+        while not local_filename.exists() and i < 10:
             # for i = 1 and i = 5 try to get a new session ID
             if i == 1 or i == 5:
-                driver = webdriver.Firefox(options=options,
-                                           firefox_profile=profile)
+                driver = Firefox(options=options)
                 # visit the main data page once to create cookies
                 driver.get(url)
                 time.sleep(20)
-
+
                 # get the session id cookie
                 cookies_selenium = driver.get_cookies()
                 cookies = {}
                 for cookie in cookies_selenium:
                     cookies[cookie['name']] = cookie['value']
-
+
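+            # fetch the file with requests, reusing the session cookies from selenium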
             r = requests.get(url, stream=True, cookies=cookies)
-            with open(str(local_filename_underscore), 'wb') as f:
+            with open(str(local_filename), 'wb') as f:
                 shutil.copyfileobj(r.raw, f)

             # check file size. if 210 or 212 bytes it's the error page
-            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+            if Path(local_filename).stat().st_size in error_file_sizes:
                 # found the error page. delete file
-                os.remove(local_filename_underscore)
+                os.remove(local_filename)

             # sleep a bit to avoid running into captchas
             time.sleep(randrange(5, 15))
             i += 1

-        if local_filename_underscore.exists():
+        if local_filename.exists():
             new_downloaded.append(submission)
-            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
-                  "/" + local_filename_underscore.name)
+            print(f"Download => {local_filename.relative_to(root)}")
         else:
-            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
-                  + bur + "/" + local_filename_underscore.name)
+            print(f"Failed to download {local_filename.relative_to(root)}")

     else:
-        print("=> Already downloaded " + local_filename_underscore.name)
+        print(f"=> Already downloaded {local_filename.relative_to(root)}")

 driver.close()

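+# save the list of newly downloaded submissions, tagged with today's date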
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / "00_new_downloads_nc-{}.csv".format(date.today()), index=False)
+df.to_csv(download_path / f"00_new_downloads_{category}-{date.today()}.csv", index=False)