123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141 |
- #import requests
- import time
- import pandas as pd
- import re
- from pathlib import Path
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from random import randrange
- root = Path(__file__).parents[2]
- """
- Download UNFCCC National Communication (NC) submissions
- from Non-Annex I Parties and create a list of submissions as a CSV file.
- Based on `process_bur` from national-inventory-submissions
- (https://github.com/openclimatedata/national-inventory-submissions)
- """
- # TODO for NC
- ## link is just /documents/XXXXX (but already dealt with in code below)
- ## url is https://unfccc.int/non-annex-I-NCs
- ## pattern needs NC instead of BUR
# Open the UNFCCC overview page of Non-Annex I NC submissions in a headless
# Firefox session and collect every anchor found in the page's second table.
print("Fetching NC submissions ...")
url = "https://unfccc.int/non-annex-I-NCs"

# Run Firefox without a visible window.
options = webdriver.FirefoxOptions()
options.add_argument("-headless")

# Attach a fresh (unconfigured) profile through the options object. The
# ``firefox_profile=`` constructor keyword was deprecated in Selenium 4 and
# dropped in later 4.x releases, while ``Options.profile`` is the supported
# way to hand a profile to the driver.
profile = webdriver.FirefoxProfile()
options.profile = profile

# Set up the selenium driver and load the overview page.
driver = webdriver.Firefox(options=options)
driver.get(url)

html = BeautifulSoup(driver.page_source, "html.parser")
tables = html.find_all("table")
if len(tables) < 2:
    # The submissions are read from the second table; without it there is
    # nothing to scrape, so shut the browser down instead of leaving it
    # running behind a bare IndexError.
    driver.quit()
    raise RuntimeError(
        f"Expected at least two tables on {url}, found {len(tables)}"
    )
table = tables[1]
links = table.find_all("a")
# Accumulators shared with the rest of the script:
#   targets      - document sub-pages still to be visited
#   downloads    - one record per download link found on a sub-page
#   no_downloads - sub-pages that offered nothing to download
targets, downloads, no_downloads = [], [], []

# Walk the anchors of the overview table and keep those that point at a
# document sub-page.
for anchor in links:
    attrs = anchor.attrs
    if "href" not in attrs:
        continue
    href = attrs["href"]
    if "/documents/" not in href:
        continue

    # Prefer the anchor's title attribute; otherwise use its first child.
    title = attrs["title"] if "title" in attrs else anchor.contents[0]

    # Turn site-relative links into absolute ones.
    if href.startswith("/documents"):
        href = f"https://unfccc.int{href}"

    # Only pages of the form https://unfccc.int/documents/65587 are queued
    # for further processing.
    parent = str(Path(href).parent)
    if parent.endswith("documents"):
        targets.append({"title": title, "url": href})
# Matches a submission label such as "NC1" or "NC 3".
pattern = re.compile(r"NC ?\d")


def _heading_value(page, heading):
    """Return the text of the second ``div`` after the ``h2`` titled *heading*.

    Raises AttributeError when the heading, or one of the ``div`` elements
    expected after it, is not present on *page*.
    """
    return page.find("h2", string=heading).find_next("div").find_next("div").text


# Visit every queued sub-page and record one entry per download link found
# in its "Versions" section. Pages without that section go to no_downloads.
for target in targets:
    # Wait a random 5-14 seconds between page loads (presumably to keep the
    # request rate polite towards the server).
    time.sleep(randrange(5, 15))

    page_url = target["url"]
    driver.get(page_url)
    html = BeautifulSoup(driver.page_source, "html.parser")

    # Convert to a plain str so the stored record does not keep a reference
    # to the parsed page it came from.
    title = str(html.find("h1").contents[0])

    # First guess for the submission kind comes from the page title.
    match = pattern.search(title)
    kind = match.group(0).replace(" ", "") if match else None

    versions = html.find("h2", string="Versions")
    if versions is None:
        no_downloads.append((title, page_url))
        continue

    links = versions.find_next("div").find_all("a")

    # The party is listed under "Countries"; fall back to "Corporate Author"
    # when that heading is missing.
    try:
        country = _heading_value(html, "Countries")
    except AttributeError:
        country = _heading_value(html, "Corporate Author")
    doctype = _heading_value(html, "Document Type")

    for link in links:
        # Skip anchors without a target, as the overview-page loop does.
        file_url = link.attrs.get("href")
        if file_url is None:
            continue

        # Once determined, the kind is reused for the remaining links of
        # this page.
        if not kind:
            match = pattern.search(file_url.upper())
            if match:
                # Normalise the same way as a kind taken from the title.
                kind = match.group(0).replace(" ", "")
            elif "NIR" in doctype or "NIR" in title:
                kind = "NIR"
            elif "INV" in title or "Inventory" in title:
                kind = "INV"
            else:
                print("found unknown record " + file_url)

        downloads.append(
            {
                "Kind": kind,
                "Country": country,
                "Title": title,
                "URL": file_url,
            }
        )
        # kind may still be None for unknown records; str.join would raise
        # TypeError on it and abort the whole run.
        print("\t".join([kind or "unknown", country, title, file_url]))
# Report the sub-pages that offered no download links.
if no_downloads:
    print("No downloads for ", no_downloads)

# quit() ends the whole WebDriver session and its driver process, whereas
# close() only closes the current browser window.
driver.quit()

# Passing the column list to the constructor fixes the column order and keeps
# the frame (and the CSV header) well-formed even when no downloads were
# found; selecting the columns from an empty frame would raise KeyError.
df = pd.DataFrame(downloads, columns=["Kind", "Country", "Title", "URL"])
df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv", index=False)