Browse files

Add code for BTR downloading

Johannes Gütschow 10 months ago
parent
commit
34660b7f47

+ 157 - 0
UNFCCC_GHG_data/UNFCCC_downloader/download_btr.py

@@ -0,0 +1,157 @@
+import argparse
+import pandas as pd
+import requests
+import shutil
+import time
+import os
+import zipfile
+from datetime import date
+from selenium.webdriver import Firefox
+from selenium.webdriver.firefox.options import Options
+from random import randrange
+from pathlib import Path
+
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
+from unfccc_submission_info import get_BTR_name_and_URL
+
+###############
+#
+# TODO
+# download directly via selenium, see the link below:
+# https://sqa.stackexchange.com/questions/2197/
+# how-to-download-a-file-using-seleniums-webdriver
+# for automatic downloading see https://stackoverflow.com/questions/70740163/
+# python-selenium-firefox-driver-dismiss-open-save-file-popup
+###############
+
+descr = 'Download and unzip data from UNFCCC Biennial Transparency Reports Submissions. ' \
+        'Based on download.py from national-inventory-submissions ' \
+        '(https://github.com/openclimatedata/national-inventory-submisions)'
+parser = argparse.ArgumentParser(description=descr)
+
+parser.add_argument(
+    '--round',
+    required=True,  # int(args.round) below fails with a confusing error otherwise
+    help='Submission round to download, e.g. 1'
+)
+
+args = parser.parse_args()
+submission_round = int(args.round)
+
+round_name, url = get_BTR_name_and_URL(submission_round)
+dataset = f"BTR{submission_round}"
+
+print(f"Downloading data for {round_name} BTRs")
+
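+# byte sizes of the UNFCCC error page; downloads of this size are
+# deleted and retried below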
+error_file_sizes = [212, 210]
+
+# Read submissions list
+submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-{dataset}.csv")
+
+# set options for the selenium driver (headless mode is currently disabled)
+profile_path = ".firefox"
+options = Options()
+# options.add_argument('-headless')
+
+# create profile for headless mode and automatic downloading
+options.set_preference('profile', profile_path)
+options.set_preference('browser.download.folderList', 2)
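+# Sketch (commented out, untested): preferences that should make Firefox
+# download files automatically without the open/save dialog, following the
+# links in the TODO above. The download directory and MIME type list are
+# assumptions, not part of the tested setup.
+# options.set_preference('browser.download.dir', str(downloaded_data_path_UNFCCC))
+# options.set_preference('browser.helperApps.neverAsk.saveToDisk',
+#                        'application/zip,application/pdf,application/octet-stream')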
+
+# set up selenium driver
+driver = Firefox(options=options)
+# visit the main data page once to create cookies
+driver.get(url)
+
+# wait a bit for the website to load before we get the cookies
+time.sleep(20)
+
+# get the session id cookie
+cookies_selenium = driver.get_cookies()
+cookies = {}
+for cookie in cookies_selenium:
+    cookies[cookie['name']] = cookie['value']
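+# the cookie dict is passed to requests below so the plain HTTP downloads
+# reuse the session established by the selenium browser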
+
+new_downloaded = []
+
+for idx, submission in submissions.iterrows():
+    print("=" * 60)
+    title = submission.Title
+    url = submission.URL
+    country = submission.Country
+    country = country.replace(' ', '_')
+    print(f"Downloading {title} from {url}")
+
+    country_folder = downloaded_data_path_UNFCCC / country
+    if not country_folder.exists():
+        country_folder.mkdir()
+    local_filename = \
+        country_folder / dataset / \
+        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
+    if not local_filename.parent.exists():
+        local_filename.parent.mkdir()
+
+    if local_filename.exists():
+        # check file size. if 210 or 212 bytes it's the error page
+        if Path(local_filename).stat().st_size in error_file_sizes:
+            # found the error page. delete file
+            os.remove(local_filename)
+    
+    # error pages have been removed above, so a file that still exists is a
+    # valid download and should not be overwritten
+    if (not local_filename.exists()) and (not local_filename.is_symlink()):
+        i = 0  # reset counter
+        while not local_filename.exists() and i < 10:
+            # for i = 1 and i = 5 try to get a new session ID
+            if i == 1 or i == 5:
+                driver.quit()  # discard the stale session before starting a new one
+                driver = Firefox(options=options)
+
+                # visit the main data page once to create cookies
+                driver.get(url)
+                time.sleep(20)
+
+                # get the session id cookie
+                cookies_selenium = driver.get_cookies()
+                cookies = {}
+                for cookie in cookies_selenium:
+                    cookies[cookie['name']] = cookie['value']
+
+            r = requests.get(url, stream=True, cookies=cookies)
+            with open(str(local_filename), 'wb') as f:
+                shutil.copyfileobj(r.raw, f)
+            
+            # check file size. if 210 or 212 bytes it's the error page
+            if Path(local_filename).stat().st_size in error_file_sizes:
+                # found the error page. delete file
+                os.remove(local_filename)
+            
+            # sleep a bit to avoid running into captchas
+            time.sleep(randrange(5, 15))
+            i += 1  # count the attempt; without this the retry loop never terminates
+
+        if local_filename.exists():
+            new_downloaded.append(submission)
+            print(f"Download => {local_filename.relative_to(root_path)}")
+            # unzip data (only for new downloads)
+            if local_filename.suffix == ".zip":
+                try:
+                    zipped_file = zipfile.ZipFile(str(local_filename), 'r')
+                    zipped_file.extractall(str(local_filename.parent))
+                    print(f"Extracted {len(zipped_file.namelist())} files.")
+                    zipped_file.close()
+                # TODO Better error logging/visibility
+                except zipfile.BadZipFile:
+                    print(f"Error while trying to extract "
+                          f"{local_filename.relative_to(root_path)}")
+                except NotImplementedError:
+                    print("Zip format not supported, please unzip on the command line.")
+            else:
+                print(f"Not attempting to extract "
+                      f"{local_filename.relative_to(root_path)}.")
+        else:
+            print(f"Failed to download {local_filename.relative_to(root_path)}")
+
+    else:
+        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
+
+driver.close()
+
+df = pd.DataFrame(new_downloaded)
+df.to_csv(downloaded_data_path_UNFCCC
+          / f"00_new_downloads_{dataset}-{date.today()}.csv", index=False)

+ 97 - 0
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_btr.py

@@ -0,0 +1,97 @@
+import argparse
+import time
+import pandas as pd
+
+from pathlib import Path
+from bs4 import BeautifulSoup
+from selenium.webdriver import Firefox
+from selenium.webdriver.firefox.options import Options
+from random import randrange
+from unfccc_submission_info import (get_unfccc_submission_info,
+                                    get_BTR_name_and_URL)
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
+
+max_tries = 10
+
+descr = ("Download UNFCCC Biannial Transparency Reports Submissions lists "
+         "and create list of submissions as CSV file. Based on "
+         "process.py from national-inventory-submissions "
+         "(https://github.com/openclimatedata/national-inventory-submisions)")
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument(
+    '--round',
+    required=True,  # int(args.round) below fails with a confusing error otherwise
+    help='1 for first BTRs, 2 for second BTRs etc.'
+)
+
+args = parser.parse_args()
+submission_round = int(args.round)
+
+round_name, url = get_BTR_name_and_URL(submission_round)
+
+print(f"Fetching submissions for {round_name} BTRs")
+print(f"Using {url} to get submissions list")
+
+# set options for headless mode
+profile_path = ".firefox"
+options = Options()
+options.add_argument('-headless')
+
+# create profile for headless mode
+options.set_preference('profile', profile_path)
+
+# set up selenium driver
+driver = Firefox(options=options)
+driver.get(url)
+
+html = BeautifulSoup(driver.page_source, "html.parser")
+
+table = html.find("table")
+
+# check whether the table was found; if not, the get command did not work,
+# most likely because of a captcha on the site
+### TODO replace by error message
+if not table:
+    raise RuntimeError('No table found on URL. Possibly due to a captcha.')
+
+links = table.find_all('a')
+
+targets = []  # sub-pages
+downloads = []  # submission documents found on the sub-pages
+no_downloads = []  # sub-pages where no documents were found
+
+# Check links for Zipfiles or subpages
+for link in links:
+    if "href" not in link.attrs:
+        continue
+    href = link.attrs["href"]
+    if "/documents/" in href:
+        if "title" in link.attrs.keys():
+            title = link.attrs["title"]
+        else:
+            title = link.contents[0]
+        if href.startswith("/documents"):
+            href = "https://unfccc.int" + href
+        # Only add pages in the format https://unfccc.int/documents/65587
+        # to further downloads
+        if str(Path(href).parent).endswith("documents"):
+            targets.append({"title": title, "url": href})
+    else:
+        print(f"Ignored link: {href}: not in the right format.")
+
+# Go through sub-pages.
+for target in targets:
+    time.sleep(randrange(5, 15))
+    url = target["url"]
+
+    submission_info = get_unfccc_submission_info(url, driver, max_tries)
+
+    if submission_info:
+        downloads = downloads + submission_info
+    else:
+        no_downloads.append({target["title"], url})
+
+if len(no_downloads) > 0:
+    print("No downloads for ", no_downloads)
+
+driver.close()
+df = pd.DataFrame(downloads)
+df.to_csv(downloaded_data_path_UNFCCC / f"submissions-BTR{submission_round}.csv", index=False)

+ 31 - 8
UNFCCC_GHG_data/UNFCCC_downloader/unfccc_submission_info.py

@@ -82,16 +82,14 @@ def get_unfccc_submission_info(
                         if match:
                             kind = match.group(0).replace(" ", "")
                         else:
-                            if ("CRF" in doctype) or ("CRF" in title):
-                                kind = "CRF"
-                            elif ("SEF" in doctype) or ("SEF" in title):
-                                kind = "SEF"
+                            if ("CRT" in doctype) or ("CRT" in title):
+                                kind = "CRT"
+                            elif ("NID" in doctype) or ("NID" in title):
+                                kind = "NID"
                             elif ("NIR" in doctype) or ("NIR" in title):
                                 kind = "NIR"
-                            elif "NC" in title:
-                                kind = "NC"
-                            elif "Status report" in title:
-                                kind = "CRF"
+                            elif ("BRT" in doctype) or ("BTR" in title):
+                                kind = "BTR"
                             else:
                                 kind = "other"
                 info.append({
@@ -106,3 +104,28 @@ def get_unfccc_submission_info(
             print(f"No files found for {url}")
 
     return info
+
+
+def get_BTR_name_and_URL(submission_round: int) -> tuple[str, str]:
+    """
+        Get the name and URL of a BTR for a given number
+
+    Parameters
+    ----------
+    submission_round (int)
+        submission_round of the BTRs e.g. 1
+
+    Returns
+    -------
+    name (str): name of the BTR submission round, e.g. 'first'
+    URL (str): URL of the submission page on the UNFCCC website
+
+    """
+
+    if submission_round == 1:
+        name = "first"
+        URL = "https://unfccc.int/first-biennial-transparency-reports"
+    else:
+        raise ValueError(f"Submission round {submission_round} is not defined")
+
+    return name, URL
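+# Example (values per the mapping above):
+#     name, url = get_BTR_name_and_URL(1)
+#     # name == "first"
+#     # url == "https://unfccc.int/first-biennial-transparency-reports"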

+ 43 - 0
dodo.py

@@ -156,6 +156,49 @@ def task_download_annexi():
     }
 
 
+# BTR data: one task to update the list of submissions for a given round and
+# one task to download the submissions from that list
+update_btr_config = {
+    "round": get_var('round', None),
+}
+
+def task_update_btr():
+    """ Update list of BTR submissions """
+    return {
+        'targets': [f"downloaded_data/UNFCCC/submissions-BTR{update_btr_config['round']}.csv"],
+        'actions': [f"datalad run -m 'Fetch Biannial Transparency Report submissions for BTR{update_btr_config['round']}' "
+                    "--explicit "
+                    f"-o downloaded_data/UNFCCC/submissions-BTR{update_btr_config['round']}.csv "
+                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_btr.py "
+                    f"--round={update_btr_config['round']}"],
+        'task_dep': ['set_env'],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
+
+def task_download_btr():
+    """ Download BTR submissions """
+    return {
+        # 'file_dep': [f"downloaded_data/UNFCCC/submissions-BTR{update_btr_config['round']}.csv"],
+        # file_dep deactivated for now as it would always run fetch_submissions
+        # before download
+        'actions': [f"datalad run -m 'Download BTR submissions for "
+                    f"BTR{update_btr_config['round']}' "
+                    f"-i downloaded_data/UNFCCC/submissions-BTR{update_btr_config['round']}.csv "
+                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_btr.py "
+                    f"--round={update_btr_config['round']}",
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
+                    f"--folder=downloaded_data/UNFCCC"
+                    ],
+        'task_dep': ['set_env'],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
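+# the submission round is passed as a doit command-line variable, e.g.:
+#     doit update_btr round=1
+#     doit download_btr round=1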
+
+
+
 def task_download_ndc():
     """ Download NDC submissions """
     return {