download_nc.py

import pandas as pd
import requests
import shutil
import time
import os
from datetime import date
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange
from pathlib import Path

root = Path(__file__).parents[2]
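# root points two directory levels above the folder containing this script and
# is used as the repository root for all paths below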
"""
based on download_bur from national-inventory-submissions
(https://github.com/openclimatedata/national-inventory-submisions)
"""

###############
#
# TODO
# download directly via selenium, see link below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
# for automatic downloading see https://stackoverflow.com/questions/70740163/
# python-selenium-firefox-driver-dismiss-open-save-file-popup
###############
submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" /
                          "submissions-nc.csv")
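# submissions-nc.csv has to provide at least the columns Kind, Title, URL and
# Country, which are read row by row in the download loop below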
main_page_url = "https://unfccc.int/non-annex-I-NCs"

# if we get files of this size they are error pages and we need to
# try the download again
error_file_sizes = [212, 210]

# Ensure download path and subfolders exist
download_path = root / "downloaded_data" / "UNFCCC"
if not download_path.exists():
    download_path.mkdir(parents=True)
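# each downloaded file ends up under downloaded_data/UNFCCC/<country>/<submission kind>/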

# set options for headless mode
profile_path = ".firefox"
options = Options()
#options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)
options.set_preference('browser.download.folderList', 2)
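# A possible sketch for the automatic-download TODO above (an untested
# assumption, not needed by the requests-based download further down): these
# preferences would make Firefox save PDFs straight to download_path instead
# of opening the save-file dialog.
#options.set_preference('browser.download.dir', str(download_path))
#options.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/pdf')
#options.set_preference('pdfjs.disabled', True)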

# set up selenium driver
driver = Firefox(options=options)
# visit the main data page once to create cookies
driver.get(main_page_url)
time.sleep(20)

# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']

print(cookies)
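# the cookie dict is passed to requests below so that the actual file downloads
# reuse the session established by the Selenium visit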
new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    bur = submission.Kind
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(title)

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()
    local_filename = country_folder / bur / url.split('/')[-1]
    local_filename_underscore = \
        download_path / country / bur / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    ### remove, not needed as no legacy data present
    #if local_filename.exists():
    #    # rename
    #    local_filename.rename(local_filename_underscore)
    #    print("Renamed " + bur + "/" + country + "/" + local_filename.name)

    # this should never be needed but in case anything goes wrong and
    # an error page is present it should be overwritten
    if local_filename_underscore.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if local_filename_underscore.stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename_underscore)

    # now we have removed error pages, so a present file should not be overwritten
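    # try the download up to 10 times; on attempts 1 and 5 a fresh browser
    # session is started to obtain new cookies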
    if not local_filename_underscore.exists():
        i = 0  # reset counter
        while not local_filename_underscore.exists() and i < 10:
            # for i = 1 and i = 5 try to get a new session ID
            if i == 1 or i == 5:
                driver = Firefox(options=options)
                # visit the main data page once to create cookies
                driver.get(main_page_url)
                time.sleep(20)
                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']

            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename_underscore), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if local_filename_underscore.stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename_underscore)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            i += 1

        if local_filename_underscore.exists():
            new_downloaded.append(submission)
            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
                  "/" + local_filename_underscore.name)
        else:
            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
                  + bur + "/" + local_filename_underscore.name)
    else:
        print("=> Already downloaded " + local_filename_underscore.name)

driver.close()

df = pd.DataFrame(new_downloaded)
df.to_csv(download_path / "00_new_downloads_nc-{}.csv".format(date.today()), index=False)