download_annexI.py

import argparse
import pandas as pd
import requests
import shutil
import time
import os
import zipfile
from datetime import date
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange
from pathlib import Path

root = Path(__file__).parents[2]

###############
#
# TODO
# download directly via selenium, see link below:
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
# for automatic downloading see https://stackoverflow.com/questions/70740163/
# python-selenium-firefox-driver-dismiss-open-save-file-popup
###############
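
# Untested sketch for the auto-download TODO above: Firefox can skip the
# open/save dialog when a download directory is set and the file's MIME type
# is listed in 'neverAsk'. The MIME types needed for the UNFCCC files are an
# assumption here ('options' and 'download_path' are defined further down).
# options.set_preference('browser.download.dir', str(download_path))
# options.set_preference('browser.helperApps.neverAsk.saveToDisk',
#                        'application/zip,application/octet-stream')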

descr = 'Download and unzip data from UNFCCC National Inventory Submissions. ' \
        'Based on download.py from national-inventory-submissions ' \
        '(https://github.com/openclimatedata/national-inventory-submisions)'
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--category',
    help='Category to download, CRF, NIR, SEF'
)
parser.add_argument(
    '--year',
    help='Year to download'
)
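
# example invocation, from the directory containing this script:
#   python download_annexI.py --category CRF --year 2021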

args = parser.parse_args()
year = args.year
category = args.category.upper()
dataset = category + year
print(f"Downloading data for {dataset}")

# generate the correct url
url = (
    "https://unfccc.int/process/transparency-and-reporting/"
    "reporting-and-review-under-the-convention/"
    "greenhouse-gas-inventories-annex-i-parties/"
    "submissions/national-inventory-submissions-{}".format(year)
)

# TODO: years before 2019
if int(year) == 2019:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )
if int(year) >= 2020:
    url = (
        "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
    )

download_path = root / "downloaded_data" / "UNFCCC"

# files of 210 or 212 bytes are the UNFCCC error page, not data;
# downloads of that size are deleted and retried below
error_file_sizes = [212, 210]

# Read submissions list
submissions = pd.read_csv(download_path / f"submissions-annexI_{year}.csv")

# filter submissions list for category
items = submissions[submissions.Kind == category.upper()]

# set options for headless mode
profile_path = ".firefox"
options = Options()
#options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)
options.set_preference('browser.download.folderList', 2)
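# folderList=2 tells Firefox to use a custom download location
# (0 = desktop, 1 = default downloads folder, 2 = directory from browser.download.dir)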

# set up selenium driver
driver = Firefox(options=options)
# visit the main data page once to create cookies
driver.get(url)
# wait a bit for the website to load before we get the cookies
time.sleep(20)
# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']
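# the cookies collected here are handed to requests.get() below, so the
# actual downloads reuse the selenium browser session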

new_downloaded = []

for idx, submission in items.iterrows():
    print("=" * 60)
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(f"Downloading {title} from {url}")

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()
    local_filename = \
        country_folder / dataset / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    if local_filename.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename)

    # now we have removed error pages, so a present file should not be overwritten
    if not local_filename.exists():
        i = 0  # reset counter
        while not local_filename.exists() and i < 10:
            # on attempts i = 1 and i = 5 try to get a new session ID
            if i == 1 or i == 5:
                driver = Firefox(options=options)

                # visit the submission page once to create cookies
                driver.get(url)
                time.sleep(20)

                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']

            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            # count the attempt; without this the loop would never
            # terminate when a download keeps failing
            i += 1

        if local_filename.exists():
            new_downloaded.append(submission)
            print(f"Download => {local_filename.relative_to(root)}")
            # unzip data (only for new downloads)
            if local_filename.suffix == ".zip":
                try:
                    zipped_file = zipfile.ZipFile(str(local_filename), 'r')
                    zipped_file.extractall(str(local_filename.parent))
                    print(f"Extracted {len(zipped_file.namelist())} files.")
                    zipped_file.close()
                # TODO Better error logging/visibility
                except zipfile.BadZipFile:
                    print(f"Error while trying to extract "
                          f"{local_filename.relative_to(root)}")
                except NotImplementedError:
                    print("Zip format not supported, "
                          "please unzip on the command line.")
            else:
                print(f"Not attempting to extract "
                      f"{local_filename.relative_to(root)}.")
        else:
            print(f"Failed to download {local_filename.relative_to(root)}")
    else:
        print(f"=> Already downloaded {local_filename.relative_to(root)}")

driver.close()

df = pd.DataFrame(new_downloaded)
df.to_csv(
    download_path / f"00_new_downloads_{category}{year}-{date.today()}.csv",
    index=False,
)