|
@@ -0,0 +1,105 @@
|
|
|
"""Download all NDC submissions listed in the openclimatedata ndcs dataset.

based on download_bur from national-inventory-submissions
(https://github.com/openclimatedata/national-inventory-submisions)

For each submission row the script builds a per-country / per-NDC folder
under ``downloaded_data/UNFCCC`` and fetches the submission file, retrying
up to 10 times when the server answers with a known error page instead of
the actual document. A CSV listing the newly downloaded submissions is
written at the end.
"""

import os
import shutil
import time
from datetime import date
from pathlib import Path
from random import randrange

import pandas as pd
import requests

# repository root (this file lives two levels below it)
root = Path(__file__).parents[2]

###############
#
# TODO
# download directly via selenium see link below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
###############

# we use the ndc package provided by openclimatedata which is updated on
# a daily basis
submissions_url = "https://github.com/openclimatedata/ndcs/raw/main/data/ndcs.csv"
submissions = pd.read_csv(submissions_url)

url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"

# if we get files of this size they are error pages and we need to
# try the download again
# TODO error page sizes are from BUR and NC and might differ for NDCs
# if an error page is found instead of a pdf adjust sizes here
error_file_sizes = [212, 210]

# Ensure download path and subfolders exist
download_path = root / "downloaded_data" / "UNFCCC"
if not download_path.exists():
    download_path.mkdir(parents=True)

# submissions successfully downloaded in this run
new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    ndc = submission.Number
    title = submission.Title
    url = submission.EncodedAbsUrl
    submission_date = submission.SubmissionDate
    country = submission.Party
    country = country.replace(' ', '_')
    print(title)

    # str() guards against pandas parsing Number / SubmissionDate as
    # non-string dtypes, which would make "+" concatenation raise TypeError
    ndc_folder = "NDC_" + str(ndc) + "_" + str(submission_date)

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()

    # target filename with URL-encoded and literal spaces replaced by "_"
    local_filename_underscore = \
        download_path / country / ndc_folder / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename_underscore.parent.exists():
        local_filename_underscore.parent.mkdir()

    # this should never be needed but in case anything goes wrong and
    # an error page is present it should be overwritten
    if local_filename_underscore.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if local_filename_underscore.stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename_underscore)

    # now we have removed error pages, so a present file should not be
    # overwritten
    if not local_filename_underscore.exists():
        attempts = 0  # reset counter
        while not local_filename_underscore.exists() and attempts < 10:
            # BUG FIX: the counter was never incremented in the original,
            # so a persistent error page caused an infinite retry loop
            attempts += 1

            # NOTE(review): copyfileobj(r.raw, ...) writes the raw socket
            # stream without content-encoding (e.g. gzip) decoding —
            # presumably the endpoint serves plain PDFs; verify if files
            # arrive corrupted
            r = requests.get(url, stream=True)
            with open(str(local_filename_underscore), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if local_filename_underscore.stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename_underscore)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))

        if local_filename_underscore.exists():
            new_downloaded.append(submission)
            print("Download => downloaded_data/UNFCCC/" + country + "/" +
                  ndc_folder + "/" + local_filename_underscore.name)
        else:
            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
                  + ndc_folder + "/" + local_filename_underscore.name)

    else:
        print("=> Already downloaded " + local_filename_underscore.name)


# persist the list of newly downloaded submissions for downstream steps
df = pd.DataFrame(new_downloaded)
df.to_csv(download_path / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)