download_annexI.py

import argparse
import pandas as pd
import requests
import shutil
import time
import os
import zipfile
from datetime import date
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange
from pathlib import Path
from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
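# root_path and downloaded_data_path_UNFCCC are helpers from this repository:
# the repository root and the base folder for downloaded UNFCCC data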
###############
#
# TODO
# download directly via selenium, see the links below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
# for automatic downloading see https://stackoverflow.com/questions/70740163/
# python-selenium-firefox-driver-dismiss-open-save-file-popup
###############
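# A possible approach for the TODO above (untested sketch based on the linked
# threads): configure the Firefox profile so that files are saved automatically
# instead of opening the save-file dialog. The preference names are standard
# Firefox settings; the MIME type list and the download_dir variable are
# assumptions and may need adjusting.
#
#   options.set_preference('browser.download.folderList', 2)  # use a custom dir
#   options.set_preference('browser.download.dir', str(download_dir))  # hypothetical target dir
#   options.set_preference(
#       'browser.helperApps.neverAsk.saveToDisk',
#       'application/zip,application/octet-stream,application/pdf')
#   options.set_preference('pdfjs.disabled', True)  # skip the built-in PDF viewer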
descr = 'Download and unzip data from UNFCCC National Inventory Submissions. ' \
        'Based on download.py from national-inventory-submissions ' \
        '(https://github.com/openclimatedata/national-inventory-submisions)'
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--category',
    help='Category to download: CRF, NIR, SEF'
)
parser.add_argument(
    '--year',
    help='Year to download'
)
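# Example invocation:
#   python download_annexI.py --category CRF --year 2023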
args = parser.parse_args()
year = args.year
category = args.category.upper()
dataset = category + year
print(f"Downloading data for {dataset}")
# generate the correct url for the submission year
# TODO: move to utils as used in two places
if int(year) == 2019:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )
elif int(year) in range(2020, 2023):
    url = (
        "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
    )
elif int(year) >= 2023:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )
else:
    url = (
        "https://unfccc.int/process/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "submissions/national-inventory-submissions-{}".format(year)
    )
# downloads of this size are the server's error page rather than a real file
error_file_sizes = [212, 210]
# read the list of submissions for the requested year
submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-annexI_{year}.csv")
# filter the submissions list for the requested category
items = submissions[submissions.Kind == category]
# set browser options; headless mode is available but currently disabled
profile_path = ".firefox"
options = Options()
# options.add_argument('-headless')
# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)
options.set_preference('browser.download.folderList', 2)
# set up the selenium driver
driver = Firefox(options=options)
# visit the main data page once to create cookies
driver.get(url)
# wait a bit for the website to load before we get the cookies
time.sleep(20)
# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']
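# the cookies captured from the browser session are passed to requests below;
# without a valid session cookie the server typically returns the small error
# page whose sizes are listed in error_file_sizes above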
new_downloaded = []
for idx, submission in items.iterrows():
    print("=" * 60)
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(f"Downloading {title} from {url}")
    country_folder = downloaded_data_path_UNFCCC / country
    if not country_folder.exists():
        country_folder.mkdir()
    # build the local file name from the URL, replacing %20 and spaces
    local_filename = (
        country_folder / dataset /
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    )
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()
    if local_filename.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename)
    # error pages have now been removed, so a file that is still present
    # should not be overwritten
    if (not local_filename.exists()) and (not local_filename.is_symlink()):
        i = 0  # reset attempt counter
        while not local_filename.exists() and i < 10:
            # on the second and sixth attempt (i == 1 and i == 5)
            # restart the browser to get a new session ID
            if i == 1 or i == 5:
                driver.quit()  # end the old session before starting a new one
                driver = Firefox(options=options)
                # visit the page once to create cookies
                driver.get(url)
                time.sleep(20)
                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']
            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename), 'wb') as f:
                shutil.copyfileobj(r.raw, f)
            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename)
            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            i += 1  # count the attempt so the loop terminates
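        # after at most ten attempts, check whether the download succeeded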
        if local_filename.exists():
            new_downloaded.append(submission)
            print(f"Download => {local_filename.relative_to(root_path)}")
            # unzip data (only for new downloads)
            if local_filename.suffix == ".zip":
                try:
                    zipped_file = zipfile.ZipFile(str(local_filename), 'r')
                    zipped_file.extractall(str(local_filename.parent))
                    print(f"Extracted {len(zipped_file.namelist())} files.")
                    zipped_file.close()
                # TODO: better error logging/visibility
                except zipfile.BadZipFile:
                    print(f"Error while trying to extract "
                          f"{local_filename.relative_to(root_path)}")
                except NotImplementedError:
                    print("Zip format not supported, please unzip on the command line.")
            else:
                print(f"Not attempting to extract "
                      f"{local_filename.relative_to(root_path)}.")
        else:
            print(f"Failed to download {local_filename.relative_to(root_path)}")
    else:
        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
driver.close()
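# save a record of this run's new downloads alongside the downloaded data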
df = pd.DataFrame(new_downloaded)
df.to_csv(downloaded_data_path_UNFCCC
          / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)