"""
Download UNFCCC National Communication (NC) submissions from Non-Annex I
Parties and create a list of submissions as a CSV file.

Based on `process_bur` from national-inventory-submissions
(https://github.com/openclimatedata/national-inventory-submisions)
"""

import re
import time
from pathlib import Path
from random import randrange

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

root = Path(__file__).parents[2]

# Differences from the BUR script:
# - document links are just /documents/XXXXX (already handled below)
# - url is https://unfccc.int/non-annex-I-NCs
# - pattern matches "NC" instead of "BUR"

print("Fetching NC submissions ...")

url = "https://unfccc.int/non-annex-I-NCs"

# set options for headless mode
options = webdriver.firefox.options.Options()
options.add_argument("-headless")

# create a profile; preferences for automatic downloading can be added here
profile = webdriver.FirefoxProfile()

# set up the selenium driver
driver = webdriver.Firefox(options=options, firefox_profile=profile)
driver.get(url)

html = BeautifulSoup(driver.page_source, "html.parser")
table = html.find_all("table")[1]
links = table.find_all("a")

targets = []  # sub-pages (one per submission document)
downloads = []
no_downloads = []

# Collect links to document sub-pages
for link in links:
    if "href" not in link.attrs:
        continue
    href = link.attrs["href"]
    if "/documents/" in href:
        if "title" in link.attrs.keys():
            title = link.attrs["title"]
        else:
            title = link.contents[0]
        if href.startswith("/documents"):
            href = "https://unfccc.int" + href
        # Only add pages in the format https://unfccc.int/documents/65587
        # to further downloads
        if str(Path(href).parent).endswith("documents"):
            targets.append({"title": title, "url": href})

pattern = re.compile(r"NC ?\d")

# Go through the document sub-pages.
# To resume an interrupted run, uncomment the skip logic below and set the
# URL of the first document that still needs processing.
#skip = True
for target in targets:
    #if target["url"] == "https://unfccc.int/documents/199234":
    #    skip = False
    #if skip:
    #    print(f"Skipping {target['title']}")
    #    continue

    # wait a random interval between requests to avoid hammering the server
    time.sleep(randrange(5, 15))
    url = target["url"]
    driver.get(url)
    html = BeautifulSoup(driver.page_source, "html.parser")

    # determine the submission kind (e.g. "NC3") from the page title
    title = html.find("h1").contents[0]
    match = pattern.search(title)
    if match:
        kind = match.group(0).replace(" ", "")
    else:
        kind = None

    # the "Versions" section holds the actual download links
    h2 = html.find("h2", text="Versions")
    if h2:
        div = h2.find_next("div")
        links = div.find_all("a")
        try:
            country = (
                html.find("h2", text="Countries").find_next("div").find_next("div").text
            )
        except AttributeError:
            country = (
                html.find("h2", text="Corporate Author")
                .find_next("div")
                .find_next("div")
                .text
            )
        doctype = (
            html.find("h2", text="Document Type").find_next("div").find_next("div").text
        )

        for link in links:
            url = link.attrs["href"]
            if not kind:
                # fall back to the download URL and document type to classify
                match = pattern.search(url.upper())
                if match:
                    kind = match.group(0)
                else:
                    if ("NIR" in doctype) or ("NIR" in title):
                        kind = "NIR"
                    elif ("INV" in title) or ("Inventory" in title):
                        kind = "INV"
                    else:
                        print("found unknown record " + url)

            downloads.append(
                {
                    "Kind": kind,
                    "Country": country,
                    "Title": title,
                    "URL": url,
                }
            )
            print("\t".join([kind or "", country, title, url]))

    else:
        no_downloads.append((title, url))

if len(no_downloads) > 0:
    print("No downloads for ", no_downloads)

driver.close()

df = pd.DataFrame(downloads)
df = df[["Kind", "Country", "Title", "URL"]]
df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv", index=False)
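
# Illustrative follow-up (kept commented out, not part of the scraping run):
# the CSV written above can be read back for downstream processing, e.g. to
# list all submissions for a single Party. The country name "Kenya" is only a
# hypothetical example value.
#
# submissions = pd.read_csv(
#     root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv"
# )
# print(submissions[submissions["Country"] == "Kenya"])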