import argparse
import sys
import time
from pathlib import Path
from random import randrange

import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

from unfccc_submission_info import get_unfccc_submission_info
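# repository root: two directory levels above this script's folder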
root = Path(__file__).absolute().parents[2]
max_tries = 10
- descr = ("Download UNFCCC National Inventory Submissions lists "
- "and create list of submissions as CSV file. Based on "
- "process.py from national-inventory-submissions "
- "(https://github.com/openclimatedata/national-inventory-submisions)")
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--year',
    required=True,
    help='Year to download'
)
args = parser.parse_args()
year = args.year
- print("Fetching submissions for {}".format(year))
url = (
    "https://unfccc.int/process/transparency-and-reporting/"
    "reporting-and-review-under-the-convention/"
    "greenhouse-gas-inventories-annex-i-parties/"
    "submissions/national-inventory-submissions-{}".format(year)
)

if int(year) == 2019:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )

if int(year) >= 2020:
    url = "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
- print(f"Using {url} to get submissions list")
- # set options for headless mode
- profile_path = ".firefox"
- options = Options()
- options.add_argument('-headless')
- # create profile for headless mode and automatic downloading
- options.set_preference('profile', profile_path)
- # set up selenium driver
- driver = Firefox(options=options)
- driver.get(url)
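# parse the rendered page and look for the table of submissions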
html = BeautifulSoup(driver.page_source, "html.parser")
table = html.find("table")
# Check if the table was found. If not, the request most likely failed,
# e.g. because of a captcha on the site.
if not table:
    # try to load a manually downloaded copy of the page from disk
    print("Download failed, trying to load manually downloaded file")
    fallback = Path(f"manual_page_downloads/National-Inventory-Submissions-{year}.html")
    if not fallback.exists():
        print(f"Manually downloaded file {fallback} not found")
        sys.exit(1)
    html = BeautifulSoup(fallback.read_text(), "html.parser")
    table = html.find("table")
    if not table:
        print(f"No submissions table found in {fallback}")
        sys.exit(1)
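# collect all links from the submissions table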
links = table.find_all('a')

targets = []  # sub-pages
downloads = []
no_downloads = []
# Check links for zip files or sub-pages
for link in links:
    if "href" not in link.attrs:
        continue
    href = link.attrs["href"]
    if "/documents/" in href:
        if "title" in link.attrs:
            title = link.attrs["title"]
        else:
            title = link.contents[0]
        if href.startswith("/documents"):
            href = "https://unfccc.int" + href
        # Only add pages in the format https://unfccc.int/documents/65587
        # to further downloads
        if str(Path(href).parent).endswith("documents"):
            targets.append({"title": title, "url": href})
    elif href.endswith(".zip"):
        if href.startswith("/files"):
            href = "https://unfccc.int" + href
        # file names follow the pattern <country>-<year>-<kind>-...,
        # so the third dash-separated token is the submission kind
        country = Path(href).name.split("-")[0].upper()
        title = f"{country} {link.contents[0]}"
        filename = Path(href).name
        file_parts = filename.split('-')
        if len(file_parts) >= 3:
            kind = file_parts[2].upper()
        elif filename.startswith('asr'):
            kind = 'CRF'
        else:
            kind = None
        print("\t".join([str(kind), country, title, href]))
        downloads.append({"Kind": kind, "Country": country,
                          "Title": title, "URL": href})
# Go through sub-pages.
for target in targets:
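    # random pause between requests to avoid hammering the server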
    time.sleep(randrange(5, 15))
    url = target["url"]
    submission_info = get_unfccc_submission_info(url, driver, max_tries)
    if submission_info:
        downloads = downloads + submission_info
    else:
        no_downloads.append({"title": target["title"], "url": url})
if len(no_downloads) > 0:
    print("No downloads for ", no_downloads)

# quit (not just close) so the geckodriver process is terminated as well
driver.quit()
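# Write the collected links to CSV. Each row has the columns
# Kind, Country, Title, URL, e.g. (values illustrative):
#   CRF, AUT, AUT National Inventory Submission 2022, https://unfccc.int/...zip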
df = pd.DataFrame(downloads)
df.to_csv(root / "downloaded_data" / "UNFCCC" / f"submissions-annexI_{year}.csv", index=False)