unfccc_submission_info.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. # helper functions to gather submission info from UNFCCC website
import re
import time
from random import randrange
from typing import Dict, List, Tuple

from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import Firefox
  9. def get_unfccc_submission_info(
  10. url: str,
  11. driver: Firefox,
  12. max_tries: int=10,
  13. ) -> List[Dict[str,str]]:
  14. info = []
  15. pattern = re.compile(r"BUR ?\d")
  16. pattern_NC = re.compile(r"NC ?\d")
  17. i = 0
  18. last_excep = None
  19. while i < max_tries:
  20. try:
  21. driver.get(url)
  22. html = BeautifulSoup(driver.page_source, "html.parser")
  23. subtree = html.find(class_="document-title")
  24. title = subtree.find("span").contents[0]
  25. break
  26. except (AttributeError, WebDriverException) as excep:
  27. last_excep = excep
  28. print(f"Error fetching {url}")
  29. print("Retrying ...")
  30. time.sleep(randrange(5, 15))
  31. i += 1
  32. continue
  33. if i == max_tries:
  34. print(f"Aborting after {max_tries} tries.")
  35. print(last_excep)
  36. else:
  37. match = pattern.search(title)
  38. if match:
  39. kind = match.group(0).replace(" ", "")
  40. else:
  41. match = pattern_NC.search(title)
  42. if match:
  43. kind = match.group(0).replace(" ", "")
  44. else:
  45. kind = None
  46. # TODO: might improve speed by first searching for class="document-line" and then operating on thie resulting subtree for the info
  47. try:
  48. subtree = html.find_all(
  49. class_="field field--name-field-document-country field--type-termstore-entity-reference field--label-inline")
  50. country = subtree[0].find(class_="field--item").contents[0]
  51. except (AttributeError, IndexError) as e:
  52. # author as backup for country
  53. subtree = html.find_all(class_="field--name-field-document-ca")
  54. country = subtree[0].find(class_="field--item").contents[0]
  55. # document type
  56. subtree = html.find_all(
  57. class_="field field--name-field-document-type field--type-termstore-entity-reference field--label-hidden field--items")
  58. doctype = subtree[0].find(class_="field--item").contents[0]
  59. # get files
  60. sub_files = html.find(
  61. class_=["form-select form-control", "form-select form-control download"])
  62. if sub_files:
  63. files = sub_files.find_all("option", value=True)
  64. files = [file.attrs['value'] for file in files]
  65. else:
  66. files = []
  67. if len(files) > 0:
  68. for file in files:
  69. if not kind:
  70. match = pattern.search(file.upper())
  71. if match:
  72. kind = match.group(0)
  73. else:
  74. match = pattern_NC.search(file.upper())
  75. if match:
  76. kind = match.group(0).replace(" ", "")
  77. else:
  78. if ("CRT" in doctype) or ("CRT" in title):
  79. kind = "CRT"
  80. elif ("NID" in doctype) or ("NID" in title):
  81. kind = "NID"
  82. elif ("NIR" in doctype) or ("NIR" in title):
  83. kind = "NIR"
  84. elif ("BRT" in doctype) or ("BTR" in title):
  85. kind = "BTR"
  86. else:
  87. kind = "other"
  88. info.append({
  89. "Kind": kind,
  90. "Country": country,
  91. "Title": title,
  92. "URL": file,
  93. })
  94. print("\t".join([kind, country, title, file]))
  95. else:
  96. print(f"No files found for {url}")
  97. return info
  98. def get_BTR_name_and_URL(submission_round: int) -> (str, str):
  99. """
  100. Get the name and URL of a BTR for a given number
  101. Parameters
  102. ----------
  103. submission_round (int)
  104. submission_round of the BTRs e.g. 1
  105. Returns
  106. -------
  107. name (str): name of the BTR submission round, e.g. 'first'
  108. URL (str): URL of the submission page on the UNFCCC website
  109. """
  110. if submission_round == 1:
  111. name = "first"
  112. URL = "https://unfccc.int/first-biennial-transparency-reports"
  113. else:
  114. raise ValueError(f"Submission round {submission_round} is not defined")
  115. return name, URL