unfccc_submission_info.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. # helper functions to gather submission info from UNFCCC website
import re
import time
from random import randrange
from typing import Dict, List

from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import Firefox
  8. def get_unfccc_submission_info(
  9. url: str,
  10. driver: Firefox,
  11. max_tries: int=10,
  12. ) -> List[Dict[str,str]]:
  13. info = []
  14. pattern = re.compile(r"BUR ?\d")
  15. i = 0
  16. while i < max_tries:
  17. try:
  18. driver.get(url)
  19. html = BeautifulSoup(driver.page_source, "html.parser")
  20. title = html.find("h1").contents[0]
  21. break
  22. except AttributeError:
  23. print(f"Error fetching {url}")
  24. print("Retrying ...")
  25. time.sleep(randrange(5, 15))
  26. i += 1
  27. continue
  28. if i == max_tries:
  29. print(f"Aborting after {max_tries} tries")
  30. else:
  31. match = pattern.search(title)
  32. if match:
  33. kind = match.group(0).replace(" ", "")
  34. else:
  35. kind = None
  36. h2 = html.find("h2", text="Versions")
  37. if h2:
  38. div = h2.findNext("div")
  39. links = div.findAll("a")
  40. try:
  41. country = (
  42. html.find("h2", text="Countries").findNext("div").findNext("div").text
  43. )
  44. except AttributeError:
  45. country = (
  46. html.find("h2", text="Corporate Author")
  47. .findNext("div")
  48. .findNext("div")
  49. .text
  50. )
  51. doctype = (
  52. html.find("h2", text="Document Type").findNext("div").findNext("div").text
  53. )
  54. for link in links:
  55. url = link.attrs["href"]
  56. if not kind:
  57. match = pattern.search(url.upper())
  58. if match:
  59. kind = match.group(0)
  60. else:
  61. if ("CRF" in doctype) or ("CRF" in title):
  62. kind = "CRF"
  63. elif ("SEF" in doctype) or ("SEF" in title):
  64. kind = "SEF"
  65. elif ("NIR" in doctype) or ("NIR" in title):
  66. kind = "NIR"
  67. elif "NC" in title:
  68. kind = "NC"
  69. elif "Status report" in title:
  70. kind = "CRF"
  71. else:
  72. kind = "other"
  73. info.append({
  74. "Kind": kind,
  75. "Country": country,
  76. "Title": title,
  77. "URL": url,
  78. })
  79. print("\t".join([kind, country, title, url]))
  80. else:
  81. print(f"No files found for {url}")
  82. return info