@@ -0,0 +1,157 @@
+import argparse
+import pandas as pd
+import requests
+import shutil
+import time
+import os
+import zipfile
+from datetime import date
+from selenium.webdriver import Firefox
+from selenium.webdriver.firefox.options import Options
+from random import randrange
+from pathlib import Path
+
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
+from unfccc_submission_info import get_BTR_name_and_URL
+
+###############
+#
+# TODO
+# download directly via selenium see link below
+# https://sqa.stackexchange.com/questions/2197/
+# how-to-download-a-file-using-seleniums-webdriver
+# for automatic downloading see https://stackoverflow.com/questions/70740163/
+# python-selenium-firefox-driver-dismiss-open-save-file-popup
+###############
+
+descr = 'Download and unzip data from UNFCCC Biennial Transparency Reports Submissions. ' \
+        'Based on download.py from national-inventory-submissions ' \
+        '(https://github.com/openclimatedata/national-inventory-submisions)'
+parser = argparse.ArgumentParser(description=descr)
+
+parser.add_argument(
+    '--round',
+    help='Submission round to download, e.g. 1'
+)
+
+args = parser.parse_args()
+submission_round = int(args.round)
+
+round_name, url = get_BTR_name_and_URL(submission_round)
+dataset = f"BTR{submission_round}"
+
+print(f"Downloading data for {round_name} BTRs")
+
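+# downloads of these sizes (in bytes) are the UNFCCC error page instead of a submission file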
+error_file_sizes = [212, 210]
+
+# Read submissions list
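+# the file is expected to provide at least the columns Title, URL and Country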
+submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-{dataset}.csv")
+
+# set options for headless mode
+profile_path = ".firefox"
+options = Options()
+#options.add_argument('-headless')
+
+# create profile for headless mode and automatic downloading
+options.set_preference('profile', profile_path)
+options.set_preference('browser.download.folderList', 2)
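+# Possible approach for the automatic-download TODO above (untested sketch, the
+# preference values below are assumptions): with 'browser.download.folderList' set
+# to 2 Firefox saves into the directory given in 'browser.download.dir', and MIME
+# types listed in 'browser.helperApps.neverAsk.saveToDisk' skip the open/save
+# dialog, so the driver itself could download the files.
+#options.set_preference('browser.download.dir', str(downloaded_data_path_UNFCCC))
+#options.set_preference('browser.helperApps.neverAsk.saveToDisk',
+#                       'application/pdf,application/zip')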
+
+# set up selenium driver
+driver = Firefox(options=options)
+# visit the main data page once to create cookies
+driver.get(url)
+
+# wait a bit for the website to load before we get the cookies
+time.sleep(20)
+
+# get the session id cookie
+cookies_selenium = driver.get_cookies()
+cookies = {}
+for cookie in cookies_selenium:
+    cookies[cookie['name']] = cookie['value']
+
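+# collect the submissions that are newly downloaded in this run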
+new_downloaded = []
+
+for idx, submission in submissions.iterrows():
+    print("=" * 60)
+    title = submission.Title
+    url = submission.URL
+    country = submission.Country
+    country = country.replace(' ', '_')
+    print(f"Downloading {title} from {url}")
+
+    country_folder = downloaded_data_path_UNFCCC / country
+    if not country_folder.exists():
+        country_folder.mkdir()
+    local_filename = \
+        country_folder / dataset / \
+        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
+    if not local_filename.parent.exists():
+        local_filename.parent.mkdir()
+
+    if local_filename.exists():
+        # check file size. if 210 or 212 bytes it's the error page
+        if Path(local_filename).stat().st_size in error_file_sizes:
+            # found the error page. delete file
+            os.remove(local_filename)
+
+    # now we have removed error pages, so a present file should not be overwritten
+    if (not local_filename.exists()) and (not local_filename.is_symlink()):
+ i = 0 # reset counter
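+        # try the download up to 10 times; a file that turns out to be the error page is removed and fetched again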
+        while not local_filename.exists() and i < 10:
+            # for i = 1 and i = 5 try to get a new session ID
+            if i == 1 or i == 5:
+                driver = Firefox(options=options)
+
+                # visit the submission page again to renew the session cookies
+                driver.get(url)
+                time.sleep(20)
+
+                # get the session id cookie
+                cookies_selenium = driver.get_cookies()
+                cookies = {}
+                for cookie in cookies_selenium:
+                    cookies[cookie['name']] = cookie['value']
+
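+            # fetch the file with requests, reusing the session cookies from the selenium browser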
+            r = requests.get(url, stream=True, cookies=cookies)
+            with open(str(local_filename), 'wb') as f:
+                shutil.copyfileobj(r.raw, f)
+
+            # check file size. if 210 or 212 bytes it's the error page
+            if Path(local_filename).stat().st_size in error_file_sizes:
+                # found the error page. delete file
+                os.remove(local_filename)
+
+            # sleep a bit to avoid running into captchas
+            time.sleep(randrange(5, 15))
+            i += 1
+
+        if local_filename.exists():
+            new_downloaded.append(submission)
+            print(f"Download => {local_filename.relative_to(root_path)}")
+            # unzip data (only for new downloads)
+            if local_filename.suffix == ".zip":
+                try:
+                    zipped_file = zipfile.ZipFile(str(local_filename), 'r')
+                    zipped_file.extractall(str(local_filename.parent))
+                    print(f"Extracted {len(zipped_file.namelist())} files.")
+                    zipped_file.close()
+                # TODO Better error logging/visibility
+                except zipfile.BadZipFile:
+                    print(f"Error while trying to extract "
+                          f"{local_filename.relative_to(root_path)}")
+                except NotImplementedError:
+                    print("Zip format not supported, please unzip on the command line.")
+            else:
+                print(f"Not attempting to extract "
+                      f"{local_filename.relative_to(root_path)}.")
+        else:
+            print(f"Failed to download {local_filename.relative_to(root_path)}")
+
+    else:
+        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
+
+driver.close()
+
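+# save the list of submissions newly downloaded in this run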
+df = pd.DataFrame(new_downloaded)
+df.to_csv(downloaded_data_path_UNFCCC
+          / f"00_new_downloads_{dataset}-{date.today()}.csv", index=False)