fetch_submissions_nc.py

#import requests
import time
import pandas as pd
import re
from pathlib import Path
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange

root = Path(__file__).parents[2]
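# repository root, two directory levels above this script; the CSV written at
# the end of the script is placed relative to this root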
  11. """
  12. Download UNFCCC Biennial Update Report submissions
  13. from Non-Annex I Parties and create list of submissions as CSV file
  14. Based on `process_bur` from national-inventory-submissions
  15. (https://github.com/openclimatedata/national-inventory-submisions)
  16. """
print("Fetching NC submissions ...")

url = "https://unfccc.int/non-annex-I-NCs"
#print(url)

# set options for headless mode
profile_path = ".firefox"
options = Options()
options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)

# set up selenium driver
driver = Firefox(options=options)
driver.get(url)
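# parse the rendered page with BeautifulSoup; a real browser is used instead
# of requests (commented out above), presumably because the submission table
# is rendered client-side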
html = BeautifulSoup(driver.page_source, "html.parser")
table = html.find_all("table")[1]
links = table.findAll("a")

targets = []  # sub-pages
downloads = []
no_downloads = []
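# `targets` collects per-document sub-pages to visit, `downloads` the file
# links found on them, and `no_downloads` the sub-pages without a
# "Versions" section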
# Check links for Zipfiles or subpages
for link in links:
    if "href" not in link.attrs:
        continue
    href = link.attrs["href"]
    if "/documents/" in href:
        if "title" in link.attrs.keys():
            title = link.attrs["title"]
        else:
            title = link.contents[0]
        if href.startswith("/documents"):
            href = "https://unfccc.int" + href
        # Only add pages in the format https://unfccc.int/documents/65587
        # to further downloads
        if str(Path(href).parent).endswith("documents"):
            targets.append({"title": title, "url": href})
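# after this loop, `targets` holds one {title, url} entry per document page
# of the form https://unfccc.int/documents/<id>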
pattern = re.compile(r"NC ?\d")

#skip = True
# Go through sub-pages.
for target in targets:
    #if target["url"] == "https://unfccc.int/documents/199234":
    #    skip = False
    #if skip:
    #    print(f"Skipping { target['title']}")
    #    continue
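    # randomized pause between page loads to avoid hammering the UNFCCC server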
    time.sleep(randrange(5, 15))
    url = target["url"]
    #subpage = requests.get(url, timeout=15.5)
    driver.get(url)
    html = BeautifulSoup(driver.page_source, "html.parser")
    title = html.find("h1").contents[0]
    match = pattern.search(title)
    if match:
        kind = match.group(0).replace(" ", "")
    else:
        kind = None
    h2 = html.find("h2", text="Versions")
    if h2:
        div = h2.findNext("div")
        links = div.findAll("a")
        try:
            country = (
                html.find("h2", text="Countries").findNext("div").findNext("div").text
            )
        except AttributeError:
            country = (
                html.find("h2", text="Corporate Author")
                .findNext("div")
                .findNext("div")
                .text
            )
        doctype = (
            html.find("h2", text="Document Type").findNext("div").findNext("div").text
        )
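        # classify each file link: fall back from the page title to the URL,
        # then to document-type/title heuristics, before giving up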
        for link in links:
            url = link.attrs["href"]
            if not kind:
                match = pattern.search(url.upper())
                if match:
                    kind = match.group(0)
                else:
                    if ("NIR" in doctype) or ("NIR" in title):
                        kind = "NIR"
                    elif ("INV" in title) or ("Inventory" in title):
                        kind = "INV"
                    else:
                        print("found unknown record: " + url)
            downloads.append(
                {
                    "Kind": kind,
                    "Country": country,
                    "Title": title,
                    "URL": url,
                }
            )
            # `kind` may still be None for unknown records; guard the join
            print("\t".join([kind or "", country, title, url]))
    else:
        no_downloads.append((title, url))
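# report sub-pages that had no "Versions" section and hence no file links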
if len(no_downloads) > 0:
    print("No downloads for ", no_downloads)

driver.close()
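# assemble the results table; note that DataFrame.to_csv does not create
# missing directories, so downloaded_data/UNFCCC must already exist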
df = pd.DataFrame(downloads)
df = df[["Kind", "Country", "Title", "URL"]]
df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv", index=False)