1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889 |
- # helper functions to gather submission info from UNFCCC website
- import time
- import re
- from random import randrange
- from typing import Dict, List
- from selenium.webdriver import Firefox
- from bs4 import BeautifulSoup
# Matches labels such as "BUR1" or "BUR 2" in titles and (upper-cased) URLs.
_BUR_PATTERN = re.compile(r"BUR ?\d")


def _section_text(html, heading: str) -> str:
    """Return the text of the second ``div`` after the ``h2`` titled *heading*.

    *html* is the parsed page (a ``BeautifulSoup`` object). Raises
    ``AttributeError`` when the heading or one of the ``div`` elements is
    missing, because the chained lookup then hits ``None``.
    """
    return html.find("h2", string=heading).find_next("div").find_next("div").text


def _classify_submission(file_url: str, doctype: str, title: str) -> str:
    """Derive the submission kind from a file URL, else from doctype/title."""
    match = _BUR_PATTERN.search(file_url.upper())
    if match:
        # Normalise "BUR 1" to "BUR1", as is done for title-derived kinds.
        return match.group(0).replace(" ", "")
    for label in ("CRF", "SEF", "NIR"):
        if label in doctype or label in title:
            return label
    if "NC" in title:
        return "NC"
    if "Status report" in title:
        return "CRF"
    return "other"


def get_unfccc_submission_info(
    url: str,
    driver: Firefox,
    max_tries: int = 10,
) -> List[Dict[str, str]]:
    """Collect the downloadable files listed on one UNFCCC submission page.

    The page is loaded through *driver* and parsed with BeautifulSoup. One
    entry is produced for every link (with an ``href``) found in the ``div``
    following the "Versions" heading, and a tab-separated summary line is
    printed for each entry.

    Parameters
    ----------
    url
        Address of the submission page.
    driver
        Selenium driver used to load the page.
    max_tries
        Maximum number of attempts to load a page with a usable ``h1`` title.

    Returns
    -------
    List of dicts with the keys ``"Kind"``, ``"Country"``, ``"Title"`` and
    ``"URL"``. The list is empty when the page could not be loaded within
    *max_tries* attempts or when it has no "Versions" heading.

    Raises
    ------
    AttributeError
        If the page has neither a "Countries" nor a "Corporate Author"
        section, or if no ``div`` follows the "Versions" heading.
    """
    info = []

    for attempt in range(max_tries):
        try:
            driver.get(url)
            html = BeautifulSoup(driver.page_source, "html.parser")
            # str() detaches the title from the parse tree so the returned
            # dicts do not keep the whole document alive.
            title = str(html.find("h1").contents[0])
            break
        except (AttributeError, IndexError):
            # No <h1> (or an empty one): treat the page as not loaded.
            print(f"Error fetching {url}")
            if attempt + 1 < max_tries:
                print("Retrying ...")
                time.sleep(randrange(5, 15))
    else:
        # Also reached when max_tries <= 0 and no attempt was made.
        print(f"Aborting after {max_tries} tries")
        return info

    versions_heading = html.find("h2", string="Versions")
    if versions_heading is None:
        print(f"No files found for {url}")
        return info

    # Anchors without an href cannot be downloaded, so they are skipped.
    links = versions_heading.find_next("div").find_all("a", href=True)

    try:
        country = _section_text(html, "Countries")
    except AttributeError:
        # Pages without a "Countries" section name a corporate author instead.
        country = _section_text(html, "Corporate Author")

    try:
        doctype = _section_text(html, "Document Type")
    except AttributeError:
        # The document type is only a classification hint; rely on the title.
        doctype = ""

    title_match = _BUR_PATTERN.search(title)
    kind = title_match.group(0).replace(" ", "") if title_match else None

    for link in links:
        file_url = link.attrs["href"]
        if kind is None:
            # Determined once, from the first link, and then reused for every
            # remaining link of this submission.
            kind = _classify_submission(file_url, doctype, title)
        info.append({
            "Kind": kind,
            "Country": country,
            "Title": title,
            "URL": file_url,
        })
        print("\t".join([kind, country, title, file_url]))

    return info
|