fetch_submissions_nc.py

"""
Download UNFCCC National Communication (NC) submissions
from Non-Annex I Parties and create a list of submissions as a CSV file.
Based on `process_bur` from national-inventory-submissions
(https://github.com/openclimatedata/national-inventory-submissions)
"""

import re
import time
from pathlib import Path
from random import randrange

import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

from UNFCCC_GHG_data.UNFCCC_downloader import get_unfccc_submission_info
from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
print("Fetching NC submissions ...")
url = "https://unfccc.int/non-annex-I-NCs"

# set options for headless mode
profile_path = ".firefox"
options = Options()
options.add_argument('-headless')
# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)

# set up the selenium driver and load the overview page
driver = Firefox(options=options)
driver.get(url)
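# A real browser is driven here so that any client-side rendering on the
# UNFCCC pages has run before the HTML is parsed.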
# parse the overview page; the second table holds the submission links
html = BeautifulSoup(driver.page_source, "html.parser")
table = html.find_all("table")[1]
links = table.find_all("a")

targets = []  # sub-pages
downloads = []
no_downloads = []
# Check links for zip files or sub-pages
for link in links:
    if "href" not in link.attrs:
        continue
    href = link.attrs["href"]
    if "/documents/" in href:
        if "title" in link.attrs:
            title = link.attrs["title"]
        else:
            title = link.contents[0]
        if href.startswith("/documents"):
            href = "https://unfccc.int" + href
        # Only add pages in the format https://unfccc.int/documents/65587
        # to further downloads
        if str(Path(href).parent).endswith("documents"):
            targets.append({"title": title, "url": href})
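# Each entry in targets is a dict like (illustrative values):
# {"title": "NC 3. Some Party", "url": "https://unfccc.int/documents/65587"}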
pattern = re.compile(r"NC ?\d")  # regex for NC submission titles (not used further below)

# Go through the sub-pages and collect the download information
for target in targets:
    # random delay to avoid hammering the server
    time.sleep(randrange(5, 15))
    url = target["url"]
    submission_info = get_unfccc_submission_info(url, driver, 10)
    if submission_info:
        downloads = downloads + submission_info
    else:
        no_downloads.append({"title": target["title"], "url": url})
if len(no_downloads) > 0:
    print("No downloads for ", no_downloads)

driver.close()
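# get_unfccc_submission_info is assumed to return a list of dicts whose keys
# include at least those selected below, e.g. (illustrative values):
# [{"Kind": "NC", "Country": "Some Party", "Title": "NC 3. Some Party",
#   "URL": "https://unfccc.int/documents/65587"}]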
df = pd.DataFrame(downloads)
df = df[["Kind", "Country", "Title", "URL"]]
df.to_csv(downloaded_data_path_UNFCCC / "submissions-nc.csv", index=False)
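# A minimal sketch of downstream use (assuming "Kind" carries values
# like "NC", as suggested by the regex above):
#
#   df = pd.read_csv(downloaded_data_path_UNFCCC / "submissions-nc.csv")
#   nc_urls = df.loc[df["Kind"] == "NC", "URL"].tolist()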