# Download UNFCCC Annex I national inventory submissions (CRF/NIR/SEF).
import argparse
import os
import shutil
import time
import zipfile
from datetime import date
from pathlib import Path
from random import randrange

import pandas as pd
import requests
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

# Repository root: this script lives two directory levels below it.
root = Path(__file__).parents[2]

descr = (
    'Download and unzip data from UNFCCC National Inventory Submissions. '
    'Based on download.py from national-inventory-submissions '
    '(https://github.com/openclimatedata/national-inventory-submisions)'
)
parser = argparse.ArgumentParser(description=descr)
parser.add_argument('--category', help='Category to download, CRF, NIR, SEF')
parser.add_argument('--year', help='Year to download')
args = parser.parse_args()

year = args.year
category = args.category.upper()
# Dataset label used for folder names, e.g. "CRF2021".
dataset = category + year
print(f"Downloading data for {dataset}")
# Pick the submission-overview page; the UNFCCC site layout changed over
# the years, so the URL pattern depends on the submission year.
if int(year) >= 2020:
    url = f"https://unfccc.int/ghg-inventories-annex-i-parties/{year}"
elif int(year) == 2019:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        f"national-inventory-submissions-{year}"
    )
else:
    url = (
        "https://unfccc.int/process/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        f"submissions/national-inventory-submissions-{year}"
    )
download_path = root / "downloaded_data" / "UNFCCC"
# Files of exactly these byte sizes are the UNFCCC error page, not data.
error_file_sizes = [212, 210]

# Table of all submissions for the year; keep only the requested kind
# (category is already upper-cased, the extra .upper() is a no-op).
submissions = pd.read_csv(download_path / f"submissions-annexI_{year}.csv")
items = submissions[submissions.Kind == category.upper()]

# NOTE(review): 'profile' is not a standard Firefox preference name —
# presumably meant to select a profile directory; confirm it has effect.
profile_path = ".firefox"
options = Options()
options.set_preference('profile', profile_path)
options.set_preference('browser.download.folderList', 2)

driver = Firefox(options=options)
driver.get(url)
time.sleep(20)  # give the page time to load before harvesting cookies

# Hand the selenium session cookies to requests for the actual downloads.
cookies = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
# Download every submission of the selected kind. Submissions that were
# newly fetched in this run are collected in new_downloaded.
new_downloaded = []
for idx, submission in items.iterrows():
    print("=" * 60)
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(f"Downloading {title} from {url}")

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()
    local_filename = \
        country_folder / dataset / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    if local_filename.exists():
        # A file of one of the known error sizes is the UNFCCC error
        # page, not data: remove it so it is downloaded again below.
        if Path(local_filename).stat().st_size in error_file_sizes:
            os.remove(local_filename)

    # Only download if we do not already have a (non-error) file.
    if not local_filename.exists():
        i = 0
        while not local_filename.exists() and i < 10:
            # On some retries restart the browser to obtain a fresh
            # session — stale sessions tend to keep serving error pages.
            if i == 1 or i == 5:
                # BUGFIX: quit the previous browser before starting a
                # new one, otherwise each retry leaks a Firefox process.
                driver.quit()
                driver = Firefox(options=options)

            driver.get(url)
            time.sleep(20)

            # Re-harvest cookies for this attempt and stream the file.
            cookies = {
                cookie['name']: cookie['value']
                for cookie in driver.get_cookies()
            }
            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename), 'wb') as f:
                shutil.copyfileobj(r.raw, f)
            r.close()  # BUGFIX: release the HTTP connection

            # Delete the file again if we only received the error page.
            if Path(local_filename).stat().st_size in error_file_sizes:
                os.remove(local_filename)

            # Sleep a random interval to avoid captchas / rate limiting.
            time.sleep(randrange(5, 15))
            # BUGFIX: the counter was never incremented, so a download
            # that keeps failing looped forever instead of 10 attempts.
            i += 1

        if local_filename.exists():
            new_downloaded.append(submission)
            print(f"Download => {local_filename.relative_to(root)}")

            # Unzip newly downloaded archives next to the zip file.
            if local_filename.suffix == ".zip":
                try:
                    # Context manager guarantees the archive is closed
                    # even if extraction raises.
                    with zipfile.ZipFile(str(local_filename), 'r') as zipped_file:
                        zipped_file.extractall(str(local_filename.parent))
                        print(f"Extracted {len(zipped_file.namelist())} files.")
                except zipfile.BadZipFile:
                    print(f"Error while trying to extract {local_filename.relative_to(root)}")
                except NotImplementedError:
                    print("Zip format not supported, please unzip on the command line.")
            else:
                print(f"Not attempting to extract {local_filename.relative_to(root)}.")
        else:
            print(f"Failed to download {local_filename.relative_to(root)}")
    else:
        print(f"=> Already downloaded {local_filename.relative_to(root)}")

driver.close()
# Record which submissions were newly downloaded in this run.
pd.DataFrame(new_downloaded).to_csv(
    download_path / f"00_new_downloads_{category}{year}-{date.today()}.csv",
    index=False,
)