|
@@ -0,0 +1,105 @@
|
|
|
"""Download all NDC submissions listed in the openclimatedata ndcs dataset.

based on download_bur from national-inventory-submissions
(https://github.com/openclimatedata/national-inventory-submisions)

For each submission row the script builds a per-country / per-NDC folder
under ``downloaded_data/UNFCCC`` and fetches the submission file, retrying
up to 10 times when the server answers with a known error page instead of
the actual document. A CSV listing the newly downloaded submissions is
written at the end.
"""

import os
import shutil
import time
from datetime import date
from pathlib import Path
from random import randrange

import pandas as pd
import requests

# repository root (this file lives two levels below it)
root = Path(__file__).parents[2]

###############
#
# TODO
# download directly via selenium see link below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
###############

# we use the ndc package provided by openclimatedata which is updated on
# a daily basis
submissions_url = "https://github.com/openclimatedata/ndcs/raw/main/data/ndcs.csv"
submissions = pd.read_csv(submissions_url)

url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"

# if we get files of this size they are error pages and we need to
# try the download again
# TODO error page sizes are from BUR and NC and might differ for NDCs
# if an error page is found instead of a pdf adjust sizes here
error_file_sizes = [212, 210]

# Ensure download path and subfolders exist
download_path = root / "downloaded_data" / "UNFCCC"
if not download_path.exists():
    download_path.mkdir(parents=True)

# submissions successfully downloaded in this run
new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    ndc = submission.Number
    title = submission.Title
    url = submission.EncodedAbsUrl
    submission_date = submission.SubmissionDate
    country = submission.Party
    country = country.replace(' ', '_')
    print(title)

    # str() guards against pandas parsing Number / SubmissionDate as
    # non-string dtypes, which would make "+" concatenation raise TypeError
    ndc_folder = "NDC_" + str(ndc) + "_" + str(submission_date)

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()

    # target filename with URL-encoded and literal spaces replaced by "_"
    local_filename_underscore = \
        download_path / country / ndc_folder / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename_underscore.parent.exists():
        local_filename_underscore.parent.mkdir()

    # this should never be needed but in case anything goes wrong and
    # an error page is present it should be overwritten
    if local_filename_underscore.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if local_filename_underscore.stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename_underscore)

    # now we have removed error pages, so a present file should not be
    # overwritten
    if not local_filename_underscore.exists():
        attempts = 0  # reset counter
        while not local_filename_underscore.exists() and attempts < 10:
            # BUG FIX: the counter was never incremented in the original,
            # so a persistent error page caused an infinite retry loop
            attempts += 1

            # NOTE(review): copyfileobj(r.raw, ...) writes the raw socket
            # stream without content-encoding (e.g. gzip) decoding —
            # presumably the endpoint serves plain PDFs; verify if files
            # arrive corrupted
            r = requests.get(url, stream=True)
            with open(str(local_filename_underscore), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if local_filename_underscore.stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename_underscore)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))

        if local_filename_underscore.exists():
            new_downloaded.append(submission)
            print("Download => downloaded_data/UNFCCC/" + country + "/" +
                  ndc_folder + "/" + local_filename_underscore.name)
        else:
            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
                  + ndc_folder + "/" + local_filename_underscore.name)

    else:
        print("=> Already downloaded " + local_filename_underscore.name)


# persist the list of newly downloaded submissions for downstream steps
df = pd.DataFrame(new_downloaded)
df.to_csv(download_path / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)