fetch_submissions_annexI.py

import argparse
import sys
import time
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange
from unfccc_submission_info import get_unfccc_submission_info
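# get_unfccc_submission_info is a local helper from this repository; judging
# from its use below, it returns a list of download entries for one
# submission sub-page (or a falsy value if the page could not be scraped)
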
# repository root: two levels above the directory containing this script
root = Path(__file__).absolute().parents[2]
max_tries = 10

descr = ("Download UNFCCC National Inventory Submissions lists "
         "and create list of submissions as CSV file. Based on "
         "process.py from national-inventory-submissions "
         "(https://github.com/openclimatedata/national-inventory-submisions)")
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--year',
    help='Year to download',
    required=True,
)
args = parser.parse_args()
year = args.year
print(f"Fetching submissions for {year}")

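# the location of the submissions overview page on unfccc.int has changed
# over the years, so the URL is chosen based on the requested year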
url = (
    "https://unfccc.int/process/transparency-and-reporting/"
    "reporting-and-review-under-the-convention/"
    "greenhouse-gas-inventories-annex-i-parties/"
    f"submissions/national-inventory-submissions-{year}"
)

if int(year) == 2019:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        f"national-inventory-submissions-{year}"
    )

if int(year) >= 2020:
    url = f"https://unfccc.int/ghg-inventories-annex-i-parties/{year}"

print(f"Using {url} to get submissions list")

# set options for headless mode
profile_path = ".firefox"
options = Options()
options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)

# set up selenium driver
driver = Firefox(options=options)
driver.get(url)

html = BeautifulSoup(driver.page_source, "html.parser")
table = html.find("table")

# if no table was found, the GET request most likely failed, e.g. because of
# a captcha on the site; fall back to a manually downloaded copy of the page
if not table:
    print("Download failed, trying to load manually downloaded file")
    fallback = Path("manual_page_downloads") / f"National-Inventory-Submissions-{year}.html"
    if not fallback.exists():
        sys.exit(f"Manually downloaded file {fallback} not found")
    html = BeautifulSoup(fallback.read_text(), "html.parser")
    table = html.find("table")
    if not table:
        sys.exit(f"No submissions table found in {fallback}")

links = table.find_all('a')

targets = []  # sub-pages
downloads = []
no_downloads = []

# Check links: zip files are direct downloads, while links to /documents/
# pages are sub-pages that have to be scraped in a second step
for link in links:
    if "href" not in link.attrs:
        continue
    href = link.attrs["href"]
    if "/documents/" in href:
        if "title" in link.attrs.keys():
            title = link.attrs["title"]
        else:
            title = link.contents[0]
        if href.startswith("/documents"):
            href = "https://unfccc.int" + href
        # Only add pages in the format https://unfccc.int/documents/65587
        # to further downloads
        if str(Path(href).parent).endswith("documents"):
            targets.append({"title": title, "url": href})
    elif href.endswith(".zip"):
        if href.startswith("/files"):
            href = "https://unfccc.int" + href
        country = Path(href).name.split("-")[0].upper()
        title = f"{country} {link.contents[0]}"
        filename = Path(href).name
        # the submission kind (e.g. CRF) is the third dash-separated part of
        # the filename, so at least three parts are needed to read it
        file_parts = filename.split('-')
        if len(file_parts) >= 3:
            kind = file_parts[2].upper()
        elif filename.startswith('asr'):
            kind = 'CRF'
        else:
            kind = None
        print("\t".join([str(kind), country, title, href]))
        downloads.append({"Kind": kind, "Country": country, "Title": title, "URL": href})

# Go through the sub-pages.
for target in targets:
    time.sleep(randrange(5, 15))  # random delay between requests
    url = target["url"]
    submission_info = get_unfccc_submission_info(url, driver, max_tries)
    if submission_info:
        downloads = downloads + submission_info
    else:
        no_downloads.append({"title": target["title"], "url": url})

if no_downloads:
    print("No downloads for ", no_downloads)

driver.quit()  # quit() also terminates the browser process, unlike close()

df = pd.DataFrame(downloads)
df.to_csv(root / "downloaded_data" / "UNFCCC" / f"submissions-annexI_{year}.csv", index=False)
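
# Example invocation (writes the CSV to <repo root>/downloaded_data/UNFCCC;
# the year is illustrative):
#   python fetch_submissions_annexI.py --year 2023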