fetch_submissions_bur.py

#import requests
import time
import pandas as pd
import re
from pathlib import Path
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange
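
# assumed repository root: two directory levels above this script's folder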
root = Path(__file__).parents[2]

"""
Download UNFCCC Biennial Update Report submissions
from Non-Annex I Parties and create a list of submissions as a CSV file.
Based on `process_bur` from national-inventory-submissions
(https://github.com/openclimatedata/national-inventory-submisions)
"""
print("Fetching BUR submissions ...")
url = "https://unfccc.int/BURs"
#print(url)

# set options for headless mode
profile_path = ".firefox"
options = Options()
options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)

# set up selenium driver
driver = Firefox(options=options)
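
# load the BUR overview page and parse the rendered HTML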
driver.get(url)
html = BeautifulSoup(driver.page_source, "html.parser")
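# the list of BUR submissions is expected in the second table on the page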
table = html.find_all("table")[1]
links = table.findAll("a")

targets = []  # sub-pages
downloads = []
no_downloads = []

# Check links for Zipfiles or subpages
for link in links:
    if "href" not in link.attrs:
        continue
    href = link.attrs["href"]
    if "/documents/" in href:
        if "title" in link.attrs.keys():
            title = link.attrs["title"]
        else:
            title = link.contents[0]
        if href.startswith("/documents"):
            href = "https://unfccc.int" + href
        # Only add pages in the format https://unfccc.int/documents/65587
        # to further downloads
        if str(Path(href).parent).endswith("documents"):
            targets.append({"title": title, "url": href})
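
# submission kind, e.g. "BUR1" or "BUR 2", as it appears in titles and URLs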
pattern = re.compile(r"BUR ?\d")

# Go through sub-pages.
for target in targets:
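    # pause a random 5-14 seconds between page loads to avoid hammering the server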
    time.sleep(randrange(5, 15))
    url = target["url"]
    #subpage = requests.get(url, timeout=15.5)
    driver.get(url)
    html = BeautifulSoup(driver.page_source, "html.parser")
    title = html.find("h1").contents[0]
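    # try to infer the submission kind (e.g. BUR1) from the page title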
    match = pattern.search(title)
    if match:
        kind = match.group(0).replace(" ", "")
    else:
        kind = None
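
    # the "Versions" section of a document page holds the download links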
    h2 = html.find("h2", text="Versions")
    if h2:
        div = h2.findNext("div")
        links = div.findAll("a")
        try:
            country = (
                html.find("h2", text="Countries").findNext("div").findNext("div").text
            )
        except AttributeError:
            country = (
                html.find("h2", text="Corporate Author")
                .findNext("div")
                .findNext("div")
                .text
            )
        doctype = (
            html.find("h2", text="Document Type").findNext("div").findNext("div").text
        )
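
        # record one entry per linked file version for this submission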
        for link in links:
            url = link.attrs["href"]
            if not kind:
                match = pattern.search(url.upper())
                if match:
                    kind = match.group(0)
                else:
                    if ("NIR" in doctype) or ("NIR" in title):
                        kind = "NIR"
                    elif "NC" in title:
                        kind = "NC"
            downloads.append(
                {
                    "Kind": kind,
                    "Country": country,
                    "Title": title,
                    "URL": url,
                }
            )
            # kind may still be None here; print an empty field in that case
            print("\t".join([kind or "", country, title, url]))
    else:
        no_downloads.append((title, url))
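
# report document pages without a "Versions" section (nothing to download)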
if len(no_downloads) > 0:
    print("No downloads for ", no_downloads)

driver.close()
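
# store the collected submission list as a CSV file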
df = pd.DataFrame(downloads)
df = df[["Kind", "Country", "Title", "URL"]]
df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-bur.csv", index=False)