fetch_submissions_annexI.py

import argparse
import time
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange

from unfccc_submission_info import get_unfccc_submission_info
from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
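
# maximum number of attempts when fetching a submission detail page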
max_tries = 10

descr = ("Download UNFCCC National Inventory Submissions lists "
         "and create list of submissions as CSV file. Based on "
         "process.py from national-inventory-submissions "
         "(https://github.com/openclimatedata/national-inventory-submisions)")
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--year',
    help='Year to download'
)

args = parser.parse_args()
year = args.year
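
# Example invocation (year is illustrative):
#     python fetch_submissions_annexI.py --year 2023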

print("Fetching submissions for {}".format(year))

# TODO: move to utils as used in two places
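# The UNFCCC site has used different URL schemes for the annual submission
# overview pages, so the URL is selected based on the requested year.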
if int(year) == 2019:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )
elif int(year) in range(2020, 2023):
    url = (
        "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
    )
elif int(year) >= 2023:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )
else:
    url = (
        "https://unfccc.int/process/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "submissions/national-inventory-submissions-{}".format(year)
    )

print(f"Using {url} to get submissions list")

# set options for headless mode
profile_path = ".firefox"
options = Options()
options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)

# set up selenium driver
driver = Firefox(options=options)
driver.get(url)

html = BeautifulSoup(driver.page_source, "html.parser")
table = html.find("table")

# Check if a table was found. If not, the request didn't work, likely because
# of a captcha on the site.
### TODO replace by error message
if not table:
    # try to load a manually downloaded html file from disk
    print('Download failed, trying to load manually downloaded file')
    fallback_file = "manual_page_downloads/National-Inventory-Submissions-{}.html".format(year)
    try:
        with open(fallback_file) as file:
            content = file.read()
    except FileNotFoundError:
        print("Manually downloaded file " + fallback_file + " not found")
        exit()
    html = BeautifulSoup(content, "html.parser")
    table = html.find("table")
    if not table:
        print("No submissions table found in " + fallback_file)
        exit()

links = table.findAll('a')

targets = []  # sub-pages
downloads = []
no_downloads = []

# Check links for Zipfiles or subpages
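# Links pointing to /documents/ are submission detail pages that are visited
# later; direct links to .zip files are recorded as downloads right away.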
for link in links:
    if "href" not in link.attrs:
        continue
    href = link.attrs["href"]
    if "/documents/" in href:
        if "title" in link.attrs.keys():
            title = link.attrs["title"]
        else:
            title = link.contents[0]
        if href.startswith("/documents"):
            href = "https://unfccc.int" + href
        # Only add pages in the format https://unfccc.int/documents/65587
        # to further downloads
        if str(Path(href).parent).endswith("documents"):
            targets.append({"title": title, "url": href})
    elif href.endswith(".zip"):
        if href.startswith("/files"):
            href = "https://unfccc.int" + href
        country = Path(href).name.split("-")[0].upper()
        title = f"{country} {link.contents[0]}"
        filename = Path(href).name
        # Filenames usually follow a <country>-<year>-<kind>-... pattern, so the
        # third dash-separated part gives the submission kind (e.g. CRF or NIR).
        file_parts = filename.split('-')
        if len(file_parts) >= 3:
            kind = file_parts[2].upper()
        elif filename.startswith('asr'):
            kind = 'CRF'
        else:
            kind = None

        print("\t".join([str(kind), country, title, href]))
        downloads.append({"Kind": kind, "Country": country, "Title": title, "URL": href})

# Go through sub-pages.
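# A random delay between requests keeps the scraping rate low, so the site is
# less likely to block the session.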
for target in targets:
    time.sleep(randrange(5, 15))
    url = target["url"]
    submission_info = get_unfccc_submission_info(url, driver, max_tries)
    if submission_info:
        downloads = downloads + submission_info
    else:
        no_downloads.append({"title": target["title"], "url": url})

if len(no_downloads) > 0:
    print("No downloads for ", no_downloads)

driver.close()

df = pd.DataFrame(downloads)
df.to_csv(downloaded_data_path_UNFCCC / f"submissions-annexI_{year}.csv", index=False)
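
# The resulting CSV has one row per submission; rows built from direct zip
# links carry the columns Kind, Country, Title and URL, while rows collected
# from the detail pages contain whatever fields get_unfccc_submission_info
# returns.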