fetch_submissions_nc.py

#import requests
import time
import pandas as pd
import re
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from random import randrange

# repository root: two directory levels above the folder containing this script
root = Path(__file__).parents[2]

"""
Download UNFCCC National Communication (NC) submissions
from Non-Annex I Parties and create a list of submissions as a CSV file.
Based on `process_bur` from national-inventory-submissions
(https://github.com/openclimatedata/national-inventory-submisions)
"""
# TODO for NC
## link is just /documents/XXXXX (but already dealt with in code below)
## url is https://unfccc.int/non-annex-I-NCs
## pattern needs NC instead of BUR

print("Fetching NC submissions ...")
url = "https://unfccc.int/non-annex-I-NCs"
#print(url)

# set options for headless mode
options = webdriver.firefox.options.Options()
options.add_argument('-headless')

# create profile for headless mode and automatic downloading
profile = webdriver.FirefoxProfile()

# set up selenium driver
driver = webdriver.Firefox(options=options, firefox_profile=profile)
driver.get(url)

html = BeautifulSoup(driver.page_source, "html.parser")
table = html.find_all("table")[1]
links = table.findAll("a")
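# Note: the list of NC submissions is assumed to sit in the second <table> on
# the page; if the UNFCCC page layout changes, find_all("table")[1] will pick
# the wrong table or raise an IndexError.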
targets = []  # sub-pages
downloads = []
no_downloads = []

# Check links for Zipfiles or subpages
for link in links:
    if "href" not in link.attrs:
        continue
    href = link.attrs["href"]
    if "/documents/" in href:
        if "title" in link.attrs.keys():
            title = link.attrs["title"]
        else:
            title = link.contents[0]
        if href.startswith("/documents"):
            href = "https://unfccc.int" + href
        # Only add pages in the format https://unfccc.int/documents/65587
        # to further downloads
        if str(Path(href).parent).endswith("documents"):
            targets.append({"title": title, "url": href})
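# Each target is a dict {"title": ..., "url": ...} pointing to a document
# detail page of the form https://unfccc.int/documents/65587.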
# pattern to detect the submission kind, e.g. "NC 3" or "NC3", in page titles
# and file URLs
pattern = re.compile(r"NC ?\d")

#skip = True
# Go through sub-pages.
for target in targets:
    #if target["url"] == "https://unfccc.int/documents/199234":
    #    skip = False
    #if skip:
    #    print(f"Skipping { target['title']}")
    #    continue
    time.sleep(randrange(5, 15))
    url = target["url"]
    #subpage = requests.get(url, timeout=15.5)
    driver.get(url)
    html = BeautifulSoup(driver.page_source, "html.parser")
    title = html.find("h1").contents[0]
    match = pattern.search(title)
    if match:
        kind = match.group(0).replace(" ", "")
    else:
        kind = None
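    # Submissions with a "Versions" section expose downloadable files; pages
    # without one are collected in no_downloads and reported at the end.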
    h2 = html.find("h2", text="Versions")
    if h2:
        div = h2.findNext("div")
        links = div.findAll("a")
        try:
            country = (
                html.find("h2", text="Countries").findNext("div").findNext("div").text
            )
        except AttributeError:
            country = (
                html.find("h2", text="Corporate Author")
                .findNext("div")
                .findNext("div")
                .text
            )
        doctype = (
            html.find("h2", text="Document Type").findNext("div").findNext("div").text
        )
        for link in links:
            url = link.attrs["href"]
            if not kind:
                match = pattern.search(url.upper())
                if match:
                    kind = match.group(0)
                else:
                    if ("NIR" in doctype) or ("NIR" in title):
                        kind = "NIR"
                    elif ("INV" in title) or ("Inventory" in title):
                        kind = "INV"
                    else:
  103. print("found unknown record" + url)
            downloads.append(
                {
                    "Kind": kind,
                    "Country": country,
                    "Title": title,
                    "URL": url,
                }
            )
            print("\t".join([kind, country, title, url]))
    else:
        no_downloads.append((title, url))

if len(no_downloads) > 0:
    print("No downloads for ", no_downloads)

driver.close()

df = pd.DataFrame(downloads)
df = df[["Kind", "Country", "Title", "URL"]]
df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv", index=False)
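
# Usage sketch (assumptions: Firefox and a matching geckodriver are available
# for selenium, and the downloaded_data/UNFCCC directory already exists, since
# DataFrame.to_csv does not create missing directories):
#   python fetch_submissions_nc.py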