import os
import shutil
import time
from datetime import date
from pathlib import Path
from random import randrange

import pandas as pd
import requests

root = Path(__file__).parents[2]
- """
- based on download_bur from national-inventory-submissions
- # (https://github.com/openclimatedata/national-inventory-submisions)
- """

###############
# TODO
# download directly via selenium, see
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
###############
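
# Sketch for the TODO above, kept for reference and not called anywhere in
# this script. The function name and the Chrome preference values are
# assumptions: selenium does not expose a download call of its own, so the
# usual workaround is to configure the browser to save files without asking.
def download_via_selenium(file_url, target_dir):
    from selenium import webdriver  # local import: selenium is optional here
    options = webdriver.ChromeOptions()
    options.add_experimental_option("prefs", {
        "download.default_directory": str(target_dir),
        "download.prompt_for_download": False,
        # save PDFs to disk instead of opening Chrome's built-in viewer
        "plugins.always_open_pdf_externally": True,
    })
    driver = webdriver.Chrome(options=options)
    driver.get(file_url)  # navigating to a file URL triggers the download
    driver.quit()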

# We use the ndcs dataset provided by openclimatedata, which is updated on
# a daily basis.
submissions_url = "https://github.com/openclimatedata/ndcs/raw/main/data/ndcs.csv"
submissions = pd.read_csv(submissions_url)

url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"

# Files of these sizes are error pages rather than pdfs, and the download
# needs to be retried.
# TODO: the error page sizes were determined for BUR and NC downloads and
# might differ for NDCs; if an error page is found instead of a pdf,
# adjust the sizes here.
error_file_sizes = [212, 210]

# Ensure the download path and subfolders exist
download_path = root / "downloaded_data" / "UNFCCC"
if not download_path.exists():
    download_path.mkdir(parents=True)

new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    ndc = submission.Number
    title = submission.Title
    url = submission.EncodedAbsUrl
    submission_date = submission.SubmissionDate
    country = submission.Party.replace(' ', '_')
    print(title)

    # an f-string also converts non-string fields such as a numeric NDC number
    ndc_folder = f"NDC_{ndc}_{submission_date}"
    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()
    local_filename = country_folder / ndc_folder / url.split('/')[-1]
    local_filename_underscore = (
        country_folder / ndc_folder /
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    )
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    # This should never be needed, but in case anything went wrong and an
    # error page is present, it should be overwritten.
    if local_filename_underscore.exists():
        # check the file size; if 210 or 212 bytes it's an error page
        if local_filename_underscore.stat().st_size in error_file_sizes:
            # found an error page, delete the file
            os.remove(local_filename_underscore)

    # Error pages have just been removed, so a file that still exists is a
    # successful earlier download and should not be overwritten.
    if not local_filename_underscore.exists():
        i = 0  # retry counter
        while not local_filename_underscore.exists() and i < 10:
            r = requests.get(url, stream=True)
            with open(str(local_filename_underscore), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check the file size; if 210 or 212 bytes it's an error page
            if local_filename_underscore.stat().st_size in error_file_sizes:
                # found an error page, delete the file
                os.remove(local_filename_underscore)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            i += 1  # advance the counter so the retry loop terminates

        if local_filename_underscore.exists():
            new_downloaded.append(submission)
            print("Download => downloaded_data/UNFCCC/" + country + "/" +
                  ndc_folder + "/" + local_filename_underscore.name)
        else:
            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
                  + ndc_folder + "/" + local_filename_underscore.name)
    else:
        print("=> Already downloaded " + local_filename_underscore.name)

# Save the record of newly downloaded submissions next to the data
df = pd.DataFrame(new_downloaded)
df.to_csv(download_path / f"00_new_downloads_ndc-{date.today()}.csv", index=False)
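
# Sketch only, not used above: a more defensive fetch that fails loudly on
# HTTP errors instead of silently writing an error page to disk. The
# User-Agent value and the timeout are assumptions, not values the UNFCCC
# site is known to require.
def fetch_or_raise(file_url):
    headers = {"User-Agent": "Mozilla/5.0 (ndc download script)"}
    r = requests.get(file_url, headers=headers, timeout=60)
    r.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
    return r.content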