# unfccc_submission_info.py
# helper functions to gather submission info from UNFCCC website
import re
import time
from random import randrange
from typing import Dict, List

from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import Firefox
  9. def get_unfccc_submission_info(
  10. url: str,
  11. driver: Firefox,
  12. max_tries: int=10,
  13. ) -> List[Dict[str,str]]:
  14. info = []
  15. pattern = re.compile(r"BUR ?\d")
  16. i = 0
  17. last_excep = None
  18. while i < max_tries:
  19. try:
  20. driver.get(url)
  21. html = BeautifulSoup(driver.page_source, "html.parser")
  22. subtree = html.find(class_="document-title")
  23. title = subtree.find("span").contents[0]
  24. break
  25. except (AttributeError, WebDriverException) as excep:
  26. last_excep = excep
  27. print(f"Error fetching {url}")
  28. print("Retrying ...")
  29. time.sleep(randrange(5, 15))
  30. i += 1
  31. continue
  32. if i == max_tries:
  33. print(f"Aborting after {max_tries} tries.")
  34. print(last_excep)
  35. else:
  36. match = pattern.search(title)
  37. if match:
  38. kind = match.group(0).replace(" ", "")
  39. else:
  40. kind = None
  41. # TODO: might improve speed by first searching for class="document-line" and then operating on thie resulting subtree for the info
  42. try:
  43. subtree = html.find_all(
  44. class_="field field--name-field-document-country field--type-termstore-entity-reference field--label-inline")
  45. country = subtree[0].find(class_="field--item").contents[0]
  46. except AttributeError:
  47. # author as backup for country
  48. subtree = html.find_all(class_="field--name-field-document-ca")
  49. country = subtree[0].find(class_="field--item").contents[0]
  50. # document type
  51. subtree = html.find_all(
  52. class_="field field--name-field-document-type field--type-termstore-entity-reference field--label-hidden field--items")
  53. doctype = subtree[0].find(class_="field--item").contents[0]
  54. # get files
  55. sub_files = html.find(
  56. class_=["form-select form-control", "form-select form-control download"])
  57. files = sub_files.find_all("option", value=True)
  58. files = [file.attrs['value'] for file in files]
  59. if len(files) > 0:
  60. for file in files:
  61. if not kind:
  62. match = pattern.search(file.upper())
  63. if match:
  64. kind = match.group(0)
  65. else:
  66. # TODO: check why search in filename makes sense (compared to
  67. # directly using doctype)
  68. if ("CRF" in doctype) or ("CRF" in title):
  69. kind = "CRF"
  70. elif ("SEF" in doctype) or ("SEF" in title):
  71. kind = "SEF"
  72. elif ("NIR" in doctype) or ("NIR" in title):
  73. kind = "NIR"
  74. elif "NC" in title:
  75. kind = "NC"
  76. elif "Status report" in title:
  77. kind = "CRF"
  78. else:
  79. kind = "other"
  80. info.append({
  81. "Kind": kind,
  82. "Country": country,
  83. "Title": title,
  84. "URL": file,
  85. })
  86. print("\t".join([kind, country, title, file]))
  87. else:
  88. print(f"No files found for {url}")
  89. return info