1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
# Helper functions to gather submission information from the UNFCCC website.
- import time
- import re
- from random import randrange
- from typing import Dict, List
- from selenium.webdriver import Firefox
- from selenium.common.exceptions import WebDriverException
- from bs4 import BeautifulSoup
def _kind_from_keywords(doctype: str, title: str) -> str:
    """Classify a submission by keywords in its document type and title.

    Used only when no ``BUR<n>`` marker was found in the title or file name.
    The checks are ordered; the first keyword that matches wins.
    """
    if ("CRF" in doctype) or ("CRF" in title):
        return "CRF"
    if ("SEF" in doctype) or ("SEF" in title):
        return "SEF"
    if ("NIR" in doctype) or ("NIR" in title):
        return "NIR"
    if "NC" in title:
        return "NC"
    if "Status report" in title:
        return "CRF"
    return "other"


def get_unfccc_submission_info(
    url: str,
    driver: Firefox,
    max_tries: int = 10,
) -> List[Dict[str, str]]:
    """Collect file information for one submission page.

    The page at ``url`` is loaded through ``driver`` and parsed with
    BeautifulSoup. Loading is attempted up to ``max_tries`` times; between
    failed attempts the function sleeps for a random 5-14 seconds.

    Parameters
    ----------
    url
        Address of the submission page to load.
    driver
        Selenium driver used to fetch the page.
    max_tries
        Maximum number of attempts to load the page and read its title.

    Returns
    -------
    list of dict
        One dict per ``<option value=...>`` found in the download select
        element, with the keys ``"Kind"``, ``"Country"``, ``"Title"`` and
        ``"URL"`` (the option's ``value`` attribute). The list is empty when
        the page could not be read within ``max_tries`` attempts or when no
        file options were found.

    Raises
    ------
    AttributeError, IndexError
        If neither the country field nor the author field used as its backup
        is present, or if the document type field is missing.
    """
    info = []
    pattern = re.compile(r"BUR ?\d")

    html = None
    title = None
    last_excep = None
    for attempt in range(max_tries):
        try:
            driver.get(url)
            html = BeautifulSoup(driver.page_source, "html.parser")
            title_node = html.find(class_="document-title")
            # find() returns None for a missing element (AttributeError on
            # the next access); an empty <span> raises IndexError. Both are
            # treated as "page not ready yet" and retried.
            title = title_node.find("span").contents[0]
            break
        except (AttributeError, IndexError, WebDriverException) as excep:
            last_excep = excep
            print(f"Error fetching {url}")
            # Only announce a retry and wait if another attempt will follow.
            if attempt + 1 < max_tries:
                print("Retrying ...")
                time.sleep(randrange(5, 15))
    else:
        # Loop ended without a successful fetch (also covers max_tries <= 0).
        print(f"Aborting after {max_tries} tries.")
        print(last_excep)
        return info

    # Preferred source for the kind: a "BUR <n>" marker in the page title.
    match = pattern.search(title)
    kind = match.group(0).replace(" ", "") if match else None

    # TODO: might improve speed by first searching for class="document-line"
    # and then operating on the resulting subtree for the info
    try:
        subtree = html.find_all(
            class_="field field--name-field-document-country field--type-termstore-entity-reference field--label-inline")
        country = subtree[0].find(class_="field--item").contents[0]
    except (AttributeError, IndexError):
        # author as backup for country; find_all() returns an empty result
        # when the country field is absent, so IndexError must be caught too
        subtree = html.find_all(class_="field--name-field-document-ca")
        country = subtree[0].find(class_="field--item").contents[0]

    # document type
    subtree = html.find_all(
        class_="field field--name-field-document-type field--type-termstore-entity-reference field--label-hidden field--items")
    doctype = subtree[0].find(class_="field--item").contents[0]

    # get files; a page without the download select element has no files
    sub_files = html.find(
        class_=["form-select form-control", "form-select form-control download"])
    options = sub_files.find_all("option", value=True) if sub_files is not None else []
    files = [option.attrs['value'] for option in options]

    if not files:
        print(f"No files found for {url}")
        return info

    for file in files:
        # NOTE(review): once a kind has been determined (from the title or
        # from an earlier file) it is reused for all remaining files -
        # confirm this is intended.
        if not kind:
            match = pattern.search(file.upper())
            if match:
                # strip the optional space so the result matches the form
                # produced from the title ("BUR1", not "BUR 1")
                kind = match.group(0).replace(" ", "")
            else:
                # TODO: check why search in filename makes sense (compared to
                # directly using doctype)
                kind = _kind_from_keywords(doctype, title)
        info.append({
            "Kind": kind,
            "Country": country,
            "Title": title,
            "URL": file,
        })
        print("\t".join([kind, country, title, file]))

    return info
|