mikapfl
/
UNFCCC_non-AnnexI_data
forked from jguetschow/UNFCCC_non-AnnexI_data


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
							import pandas as pd
import requests
import shutil
import time
import os
from datetime import date
from selenium import webdriver
from random import randrange

from pathlib import Path
root = Path(__file__).parents[2]
"""
based on download_bur from national-inventory-submissions
# (https://github.com/openclimatedata/national-inventory-submisions)
"""

###############
#
# TODO
# download directly via selenium see link below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
###############

submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" /
                          "submissions-bur.csv")

url = "https://unfccc.int/BURs"

# if we get files of this size they are error pages and we need to
# try the download again
error_file_sizes = [212, 210]

# find which BUR submission rounds exist
present_BURs = submissions.Kind.unique()

# Ensure download path and subfolders exist
download_path = root / "downloaded_data/UNFCCC"
if not download_path.exists():
    download_path.mkdir(parents=True)

# set options for headless mode
options = webdriver.firefox.options.Options()
# options.add_argument('-headless')

# create profile for headless mode 
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)

# set up selenium driver
driver = webdriver.Firefox(options=options, firefox_profile=profile)

# visit the main data page once to create cookies
driver.get(url)
time.sleep(20)

# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']

print(cookies)

new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    bur = submission.Kind
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(title)

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()
    local_filename = country_folder / bur / url.split('/')[-1]
    local_filename_underscore = \
        download_path / country / bur / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    ### remove, not needed as no legacy data present
    #if local_filename.exists():
    #    # rename
    #    local_filename.rename(local_filename_underscore)
    #    print("Renamed " + bur + "/" + country + "/" + local_filename.name)

    # this should never be needed but in case anything goes wrong and
    # an error page is present it should be overwritten
    if local_filename_underscore.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename_underscore)
    
    # now we have remove error pages, so a present file should not be overwritten
    if not local_filename_underscore.exists():
        i = 0  # reset counter
        while not local_filename_underscore.exists() and i < 10:
            # for i = 0 and i = 5 try to get a new session ID
            if i == 1 or i == 5:
                driver = webdriver.Firefox(options=options,
                                           firefox_profile=profile)
    
                # visit the main data page once to create cookies
                driver.get(url)
                time.sleep(20)
                
                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']
                    
            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename_underscore), 'wb') as f:
                shutil.copyfileobj(r.raw, f)
            
            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename_underscore)
            
            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            
        if local_filename_underscore.exists():
            new_downloaded.append(submission)
            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
                  "/" + local_filename_underscore.name)
        else:
            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
                  + bur + "/" + local_filename_underscore.name)

    else:
        print("=> Already downloaded " + local_filename_underscore.name)

driver.close()

df = pd.DataFrame(new_downloaded)
df.to_csv(download_path / "00_new_downloads_bur-{}.csv".format(date.today()), index=False)