@@ -1,3 +1,4 @@
+import argparse
 import pandas as pd
 import requests
 import shutil
@@ -7,13 +8,9 @@ from datetime import date
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
-
 from pathlib import Path
+
 root = Path(__file__).parents[2]
-"""
-based on download_bur from national-inventory-submissions
-# (https://github.com/openclimatedata/national-inventory-submisions)
-"""

 ###############
 #
@@ -25,19 +22,31 @@ based on download_bur from national-inventory-submissions
 # python-selenium-firefox-driver-dismiss-open-save-file-popup
 ###############

-submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" /
-                          "submissions-nc.csv")
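+# command line interface to select the category of submissions to download,
+# e.g. "--category BUR" or "--category NC"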
+descr = 'Download data from UNFCCC non-AnnexI Submissions. ' \
+        'Based on download_bur.py from national-inventory-submissions ' \
+        '(https://github.com/openclimatedata/national-inventory-submisions)'
+parser = argparse.ArgumentParser(description=descr)
+parser.add_argument(
+    '--category',
+    help='Category to download: BUR, NC'
+)

-url = "https://unfccc.int/non-annex-I-NCs"
|
|
|
|
|
|
+args = parser.parse_args()
|
|
|
|
+category = args.category.upper()
|
|
|
|
+print(f"Downloading {category} submissions")
|
|
|
|
+
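+# each category has its own overview page on the UNFCCC website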
+if category == "BUR":
+    url = "https://unfccc.int/BURs"
+else:
+    url = "https://unfccc.int/non-annex-I-NCs"

 # if we get files of this size they are error pages and we need to
 # try the download again
 error_file_sizes = [212, 210]

-# Ensure download path and subfolders exist
+# Read submissions list
 download_path = root / "downloaded_data" / "UNFCCC"
-if not download_path.exists():
-    download_path.mkdir(parents=True)
+submissions = pd.read_csv(download_path / f"submissions-{category.lower()}.csv")

 # set options for headless mode
 profile_path = ".firefox"
@@ -53,6 +62,7 @@ driver = Firefox(options=options)
 # visit the main data page once to create cookies
 driver.get(url)

+# wait a bit for the website to load before we get the cookies
 time.sleep(20)

 # get the session id cookie
@@ -61,86 +71,72 @@ cookies = {}
 for cookie in cookies_selenium:
     cookies[cookie['name']] = cookie['value']

-print(cookies)
-
 new_downloaded = []

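+# loop over the submissions list and download every file that is not yet present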
 for idx, submission in submissions.iterrows():
     print("=" * 60)
-    bur = submission.Kind
+    kind = submission.Kind
     title = submission.Title
     url = submission.URL
     country = submission.Country
     country = country.replace(' ', '_')
-    print(title)
+    print(f"Downloading {title} from {url}")

     country_folder = download_path / country
     if not country_folder.exists():
         country_folder.mkdir()
-    local_filename = country_folder / bur / url.split('/')[-1]
-    local_filename_underscore = \
-        download_path / country / bur / \
+    local_filename = \
+        country_folder / kind / \
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
     if not local_filename.parent.exists():
         local_filename.parent.mkdir()

-    ### remove, not needed as no legacy data present
-    #if local_filename.exists():
-    #    # rename
-    #    local_filename.rename(local_filename_underscore)
-    #    print("Renamed " + bur + "/" + country + "/" + local_filename.name)
-
-    # this should never be needed but in case anything goes wrong and
-    # an error page is present it should be overwritten
-    if local_filename_underscore.exists():
+    if local_filename.exists():
         # check file size. if 210 or 212 bytes it's the error page
-        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+        if Path(local_filename).stat().st_size in error_file_sizes:
             # found the error page. delete file
-            os.remove(local_filename_underscore)
+            os.remove(local_filename)

-    # now we have remove error pages, so a present file should not be overwritten
-    if not local_filename_underscore.exists():
+    # now we have removed error pages, so a present file should not be overwritten
+    if not local_filename.exists():
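+        # try the download up to 10 times; get a fresh session on attempts 1 and 5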
         i = 0 # reset counter
-        while not local_filename_underscore.exists() and i < 10:
+        while not local_filename.exists() and i < 10:
             # for i = 1 and i = 5 try to get a new session ID
             if i == 1 or i == 5:
-                driver = webdriver.Firefox(options=options,
-                                           firefox_profile=profile)
+                driver = Firefox(options=options)
                 # visit the main data page once to create cookies
                 driver.get(url)
                 time.sleep(20)
-
+
                 # get the session id cookie
                 cookies_selenium = driver.get_cookies()
                 cookies = {}
                 for cookie in cookies_selenium:
                     cookies[cookie['name']] = cookie['value']
-
+
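+            # fetch the file with requests, reusing the session cookies from selenium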
             r = requests.get(url, stream=True, cookies=cookies)
-            with open(str(local_filename_underscore), 'wb') as f:
+            with open(str(local_filename), 'wb') as f:
                 shutil.copyfileobj(r.raw, f)

             # check file size. if 210 or 212 bytes it's the error page
-            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
+            if Path(local_filename).stat().st_size in error_file_sizes:
                 # found the error page. delete file
-                os.remove(local_filename_underscore)
+                os.remove(local_filename)

             # sleep a bit to avoid running into captchas
             time.sleep(randrange(5, 15))
             i += 1

-        if local_filename_underscore.exists():
+        if local_filename.exists():
             new_downloaded.append(submission)
-            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
-                  "/" + local_filename_underscore.name)
+            print(f"Download => {local_filename.relative_to(root)}")
         else:
-            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
-                  + bur + "/" + local_filename_underscore.name)
+            print(f"Failed to download {local_filename.relative_to(root)}")

     else:
-        print("=> Already downloaded " + local_filename_underscore.name)
+        print(f"=> Already downloaded {local_filename.relative_to(root)}")

 driver.close()

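+# save the list of newly downloaded submissions, tagged with today's date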
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / "00_new_downloads_nc-{}.csv".format(date.today()), index=False)
+df.to_csv(download_path / f"00_new_downloads_{category}-{date.today()}.csv", index=False)