download_ndc.py

import pandas as pd
import requests
import shutil
import time
import os
from datetime import date
from random import randrange
from pathlib import Path

root = Path(__file__).parents[2]
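# NOTE: the script is assumed to sit two directory levels below the
# repository root (hence parents[2]); all downloads end up under
# <repo>/downloaded_data/UNFCCC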
  10. """
  11. based on download_bur from national-inventory-submissions
  12. # (https://github.com/openclimatedata/national-inventory-submisions)
  13. """
###############
#
# TODO
# download directly via selenium, see link below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
###############

# we use the ndc data package provided by openclimatedata which is
# updated on a daily basis
submissions_url = "https://github.com/openclimatedata/ndcs/raw/main/data/ndcs.csv"
submissions = pd.read_csv(submissions_url)
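# columns of ndcs.csv used below: Party, Number, Title, EncodedAbsUrl,
# SubmissionDate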

# NOTE: this url is only a default; it is overwritten with each
# submission's EncodedAbsUrl inside the loop below
url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"

# if we get files of this size they are error pages and we need to
# try the download again
# TODO error page sizes are from BUR and NC and might differ for NDCs
# if an error page is found instead of a pdf adjust sizes here
error_file_sizes = [212, 210]

# Ensure download path and subfolders exist
download_path = root / "downloaded_data" / "UNFCCC"
if not download_path.exists():
    download_path.mkdir(parents=True)

# rows of the submissions table downloaded in this run; written to a
# summary csv at the end
new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    ndc = submission.Number
    title = submission.Title
    url = submission.EncodedAbsUrl
    submission_date = submission.SubmissionDate
    country = submission.Party
    # folder names use underscores instead of spaces
    country = country.replace(' ', '_')
    print(title)

    ndc_folder = "NDC_" + ndc + "_" + submission_date

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()

    local_filename = country_folder / ndc_folder / url.split('/')[-1]
    local_filename_underscore = \
        download_path / country / ndc_folder / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
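    # both filename variants resolve to the same parent folder; the
    # underscore variant is the name actually written to disk, while
    # local_filename is only needed to create that folder below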
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    # this should never be needed, but in case anything went wrong and
    # an error page is present it should be overwritten
    if local_filename_underscore.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename_underscore)

    # now that error pages have been removed, an existing file is a real
    # download and should not be overwritten
    if not local_filename_underscore.exists():
        i = 0  # reset retry counter
        while not local_filename_underscore.exists() and i < 10:
            r = requests.get(url, stream=True)
            with open(str(local_filename_underscore), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename_underscore)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            i += 1  # count the attempt so the loop terminates after 10 tries
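        # after at most 10 attempts, check whether the download succeeded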
        if local_filename_underscore.exists():
            new_downloaded.append(submission)
            print("Download => downloaded_data/UNFCCC/" + country + "/" +
                  ndc_folder + "/" + local_filename_underscore.name)
        else:
            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
                  + ndc_folder + "/" + local_filename_underscore.name)
    else:
        print("=> Already downloaded " + local_filename_underscore.name)

# write a summary csv of everything newly downloaded in this run
df = pd.DataFrame(new_downloaded)
df.to_csv(download_path / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)
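
# usage sketch (assuming the repository layout described at the top):
#   python download_ndc.py
# the summary of new downloads lands next to the data as
# 00_new_downloads_ndc-<today's date>.csv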