fetch_submissions_nc.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. #import requests
  2. import time
  3. import pandas as pd
  4. import re
  5. from pathlib import Path
  6. from bs4 import BeautifulSoup
  7. from selenium.webdriver import Firefox
  8. from selenium.webdriver.firefox.options import Options
  9. from random import randrange
  10. from unfccc_submission_info import get_unfccc_submission_info
  11. root = Path(__file__).absolute().parents[2]
  12. """
  13. Download UNFCCC Biennial Update Report submissions
  14. from Non-Annex I Parties and create list of submissions as CSV file
  15. Based on `process_bur` from national-inventory-submissions
  16. (https://github.com/openclimatedata/national-inventory-submisions)
  17. """
  18. print("Fetching NC submissions ...")
  19. url = "https://unfccc.int/non-annex-I-NCs"
  20. #print(url)
  21. # set options for headless mode
  22. profile_path = ".firefox"
  23. options = Options()
  24. options.add_argument('-headless')
  25. # create profile for headless mode and automatic downloading
  26. options.set_preference('profile', profile_path)
  27. # set up selenium driver
  28. driver = Firefox(options=options)
  29. driver.get(url)
  30. html = BeautifulSoup(driver.page_source, "html.parser")
  31. table = html.find_all("table")[1]
  32. links = table.findAll("a")
  33. targets = [] # sub-pages
  34. downloads = []
  35. no_downloads = []
  36. # Check links for Zipfiles or subpages
  37. for link in links:
  38. if "href" not in link.attrs:
  39. continue
  40. href = link.attrs["href"]
  41. if "/documents/" in href:
  42. if "title" in link.attrs.keys():
  43. title = link.attrs["title"]
  44. else:
  45. title = link.contents[0]
  46. if href.startswith("/documents"):
  47. href = "https://unfccc.int" + href
  48. # Only add pages in the format https://unfccc.int/documents/65587
  49. # to further downloads
  50. if str(Path(href).parent).endswith("documents"):
  51. targets.append({"title": title, "url": href})
  52. pattern = re.compile(r"NC ?\d")
  53. # Go through sub-pages.
  54. for target in targets:
  55. time.sleep(randrange(5, 15))
  56. url = target["url"]
  57. submission_info = get_unfccc_submission_info(url, driver, 10)
  58. if submission_info:
  59. downloads = downloads + submission_info
  60. else:
  61. no_downloads.append({target["title"], url})
  62. if len(no_downloads) > 0:
  63. print("No downloads for ", no_downloads)
  64. driver.close()
  65. df = pd.DataFrame(downloads)
  66. df = df[["Kind", "Country", "Title", "URL"]]
  67. df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv", index=False)