# Helper functions to gather submission info from the UNFCCC website
- import time
- import re
- from random import randrange
- from typing import Dict, List
- from selenium.webdriver import Firefox
- from selenium.common.exceptions import WebDriverException
- from bs4 import BeautifulSoup
# Identifiers such as "BUR1" / "BUR 1" and "NC3" / "NC 3" in titles or file names.
_BUR_PATTERN = re.compile(r"BUR ?\d")
_NC_PATTERN = re.compile(r"NC ?\d")


def _match_report_kind(text: str):
    """Return the BUR/NC identifier found in *text* with spaces removed, or None.

    The BUR pattern is tried before the NC pattern, so a text containing both
    yields the BUR identifier.
    """
    for pattern in (_BUR_PATTERN, _NC_PATTERN):
        match = pattern.search(text)
        if match:
            # Normalise "BUR 1" and "BUR1" to the same value.
            return match.group(0).replace(" ", "")
    return None


def _kind_from_keywords(title: str, doctype: str) -> str:
    """Classify a submission by keywords when no BUR/NC identifier was found."""
    if ("CRF" in doctype) or ("CRF" in title):
        return "CRF"
    if ("SEF" in doctype) or ("SEF" in title):
        return "SEF"
    if ("NIR" in doctype) or ("NIR" in title):
        return "NIR"
    if "NC" in title:
        return "NC"
    if "Status report" in title:
        return "CRF"
    return "other"


def _fetch_page_and_title(url: str, driver: Firefox, max_tries: int):
    """Load *url* with *driver* and return ``(html, title)``, or None on failure.

    A try counts as failed when the driver raises ``WebDriverException`` or
    when the "document-title" element (or its ``span``) is missing from the
    parsed page. Between failed tries the function sleeps a random 5-14
    seconds; there is no sleep after the last try because nothing follows it.
    """
    last_excep = None
    for attempt in range(1, max_tries + 1):
        try:
            driver.get(url)
            html = BeautifulSoup(driver.page_source, "html.parser")
            subtree = html.find(class_="document-title")
            title = subtree.find("span").contents[0]
            return html, title
        except (AttributeError, WebDriverException) as excep:
            last_excep = excep
            print(f"Error fetching {url}")
            if attempt < max_tries:
                print("Retrying ...")
                time.sleep(randrange(5, 15))

    print(f"Aborting after {max_tries} tries.")
    print(last_excep)
    return None


def get_unfccc_submission_info(
        url: str,
        driver: Firefox,
        max_tries: int = 10,
) -> List[Dict[str, str]]:
    """Collect the downloadable files listed on one UNFCCC submission page.

    The page is loaded through *driver* (retrying up to *max_tries* times),
    then title, country, document type and the file options of the download
    select element are read from the parsed HTML.

    The submission kind is determined once per page: first from a BUR/NC
    identifier in the title, otherwise from the first file name, otherwise
    from keywords in document type and title ("CRF", "SEF", "NIR", "NC",
    "Status report"), falling back to "other". All files of the page share
    that kind.

    Parameters
    ----------
    url
        Address of the submission page.
    driver
        Selenium Firefox driver used to load the page.
    max_tries
        Maximum number of attempts to load the page and find its title.

    Returns
    -------
    One dict per file with the keys "Kind", "Country", "Title" and "URL".
    The list is empty when the page could not be loaded or lists no files.

    Raises
    ------
    IndexError, AttributeError
        If the country (including its author fallback) or the document type
        field is missing from the page; these lookups are not guarded.
    """
    info = []

    page = _fetch_page_and_title(url, driver, max_tries)
    if page is None:
        return info
    html, title = page

    kind = _match_report_kind(title)

    # TODO: might improve speed by first searching for class="document-line" and then operating on the resulting subtree for the info
    try:
        subtree = html.find_all(
            class_="field field--name-field-document-country field--type-termstore-entity-reference field--label-inline")
        country = subtree[0].find(class_="field--item").contents[0]
    except (AttributeError, IndexError):
        # author as backup for country
        subtree = html.find_all(class_="field--name-field-document-ca")
        country = subtree[0].find(class_="field--item").contents[0]

    # document type
    subtree = html.find_all(
        class_="field field--name-field-document-type field--type-termstore-entity-reference field--label-hidden field--items")
    doctype = subtree[0].find(class_="field--item").contents[0]

    # get files: every <option> with a value attribute inside the select element
    sub_files = html.find(
        class_=["form-select form-control", "form-select form-control download"])
    if sub_files:
        files = [
            option.attrs['value']
            for option in sub_files.find_all("option", value=True)
        ]
    else:
        files = []

    if not files:
        print(f"No files found for {url}")
        return info

    for file in files:
        if kind is None:
            # Title gave no identifier: decide once, using the first file
            # name and then the keyword fallback; later files reuse it.
            kind = (
                _match_report_kind(file.upper())
                or _kind_from_keywords(title, doctype)
            )
        info.append({
            "Kind": kind,
            "Country": country,
            "Title": title,
            "URL": file,
        })
        print("\t".join([kind, country, title, file]))

    return info
|