# fetch_submissions_bur.py
  1. #import requests
  2. import time
  3. import pandas as pd
  4. import re
  5. from pathlib import Path
  6. from bs4 import BeautifulSoup
  7. from selenium import webdriver
  8. from random import randrange
  9. root = Path(__file__).parents[2]
  10. """
  11. Download UNFCCC Biennial Update Report submissions
  12. from Non-Annex I Parties and create list of submissions as CSV file
  13. Based on `process_bur` from national-inventory-submissions
  14. (https://github.com/openclimatedata/national-inventory-submisions)
  15. """
  16. # TODO for NC
  17. ## link is just /documents/XXXXX (but already dealt with in code below)
  18. ## url is https://unfccc.int/non-annex-I-NCs
  19. ## pattern needs NC instead of BUR
  20. print("Fetching BUR submissions ...")
  21. url = "https://unfccc.int/BURs"
  22. #print(url)
  23. # set options for headless mode
  24. options = webdriver.firefox.options.Options()
  25. options.add_argument('-headless')
  26. # create profile for headless mode and automatic downloading
  27. profile = webdriver.FirefoxProfile()
  28. # set up selenium driver
  29. driver = webdriver.Firefox(options=options, firefox_profile=profile)
  30. driver.get(url)
  31. html = BeautifulSoup(driver.page_source, "html.parser")
  32. table = html.find_all("table")[1]
  33. links = table.findAll("a")
  34. targets = [] # sub-pages
  35. downloads = []
  36. no_downloads = []
  37. # Check links for Zipfiles or subpages
  38. for link in links:
  39. if "href" not in link.attrs:
  40. continue
  41. href = link.attrs["href"]
  42. if "/documents/" in href:
  43. if "title" in link.attrs.keys():
  44. title = link.attrs["title"]
  45. else:
  46. title = link.contents[0]
  47. if href.startswith("/documents"):
  48. href = "https://unfccc.int" + href
  49. # Only add pages in the format https://unfccc.int/documents/65587
  50. # to further downloads
  51. if str(Path(href).parent).endswith("documents"):
  52. targets.append({"title": title, "url": href})
  53. pattern = re.compile(r"BUR ?\d")
  54. # Go through sub-pages.
  55. for target in targets:
  56. time.sleep(randrange(5, 15))
  57. url = target["url"]
  58. #subpage = requests.get(url, timeout=15.5)
  59. driver.get(url)
  60. html = BeautifulSoup(driver.page_source, "html.parser")
  61. title = html.find("h1").contents[0]
  62. match = pattern.search(title)
  63. if match:
  64. kind = match.group(0).replace(" ", "")
  65. else:
  66. kind = None
  67. h2 = html.find("h2", text="Versions")
  68. if h2:
  69. div = h2.findNext("div")
  70. links = div.findAll("a")
  71. try:
  72. country = (
  73. html.find("h2", text="Countries").findNext("div").findNext("div").text
  74. )
  75. except AttributeError:
  76. country = (
  77. html.find("h2", text="Corporate Author")
  78. .findNext("div")
  79. .findNext("div")
  80. .text
  81. )
  82. doctype = (
  83. html.find("h2", text="Document Type").findNext("div").findNext("div").text
  84. )
  85. for link in links:
  86. url = link.attrs["href"]
  87. if not kind:
  88. match = pattern.search(url.upper())
  89. if match:
  90. kind = match.group(0)
  91. else:
  92. if ("NIR" in doctype) or ("NIR" in title):
  93. kind = "NIR"
  94. elif "NC" in title:
  95. kind = "NC"
  96. downloads.append(
  97. {
  98. "Kind": kind,
  99. "Country": country,
  100. "Title": title,
  101. "URL": url,
  102. }
  103. )
  104. print("\t".join([kind, country, title, url]))
  105. else:
  106. no_downloads.append((title, url))
  107. if len(no_downloads) > 0:
  108. print("No downloads for ", no_downloads)
  109. driver.close()
  110. df = pd.DataFrame(downloads)
  111. df = df[["Kind", "Country", "Title", "URL"]]
  112. df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-bur.csv", index=False)