download_annexI.py

import argparse
import pandas as pd
import requests
import shutil
import time
import os
import zipfile
from datetime import date
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange
from pathlib import Path
from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
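# root_path and downloaded_data_path_UNFCCC are helpers from this repository:
# the repository root and the base folder for downloaded UNFCCC data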
###############
#
# TODO
# download directly via selenium, see the links below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
# for automatic downloading see https://stackoverflow.com/questions/70740163/
# python-selenium-firefox-driver-dismiss-open-save-file-popup
###############
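# A possible approach for the TODO above (untested sketch based on the linked
# threads): configure the Firefox profile so that files are saved automatically
# instead of opening the save-file dialog. The preference names are standard
# Firefox settings; the MIME type list and the download_dir variable are
# assumptions and may need adjusting.
#
#   options.set_preference('browser.download.folderList', 2)  # use a custom dir
#   options.set_preference('browser.download.dir', str(download_dir))  # hypothetical target dir
#   options.set_preference(
#       'browser.helperApps.neverAsk.saveToDisk',
#       'application/zip,application/octet-stream,application/pdf')
#   options.set_preference('pdfjs.disabled', True)  # skip the built-in PDF viewer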
descr = 'Download and unzip data from UNFCCC National Inventory Submissions. ' \
        'Based on download.py from national-inventory-submissions ' \
        '(https://github.com/openclimatedata/national-inventory-submisions)'
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--category',
    help='Category to download: CRF, NIR, SEF'
)
parser.add_argument(
    '--year',
    help='Year to download'
)
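# Example invocation:
#   python download_annexI.py --category CRF --year 2023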
args = parser.parse_args()
year = args.year
category = args.category.upper()
dataset = category + year
print(f"Downloading data for {dataset}")
# generate the correct url for the submission year
# TODO: move to utils as used in two places
if int(year) == 2019:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )
elif int(year) in range(2020, 2023):
    url = (
        "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
    )
elif int(year) >= 2023:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )
else:
    url = (
        "https://unfccc.int/process/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "submissions/national-inventory-submissions-{}".format(year)
    )
# downloads of this size are the server's error page rather than a real file
error_file_sizes = [212, 210]
# read the list of submissions for the requested year
submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-annexI_{year}.csv")
# filter the submissions list for the requested category
items = submissions[submissions.Kind == category]
# set browser options; headless mode is available but currently disabled
profile_path = ".firefox"
options = Options()
# options.add_argument('-headless')
# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)
options.set_preference('browser.download.folderList', 2)
# set up the selenium driver
driver = Firefox(options=options)
# visit the main data page once to create cookies
driver.get(url)
# wait a bit for the website to load before we get the cookies
time.sleep(20)
# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']
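# the cookies captured from the browser session are passed to requests below;
# without a valid session cookie the server typically returns the small error
# page whose sizes are listed in error_file_sizes above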
new_downloaded = []
for idx, submission in items.iterrows():
    print("=" * 60)
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(f"Downloading {title} from {url}")
    country_folder = downloaded_data_path_UNFCCC / country
    if not country_folder.exists():
        country_folder.mkdir()
    # build the local file name from the URL, replacing %20 and spaces
    local_filename = (
        country_folder / dataset /
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    )
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()
    if local_filename.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename)
    # error pages have now been removed, so a file that is still present
    # should not be overwritten
    if (not local_filename.exists()) and (not local_filename.is_symlink()):
        i = 0  # reset attempt counter
        while not local_filename.exists() and i < 10:
            # on the second and sixth attempt (i == 1 and i == 5)
            # restart the browser to get a new session ID
            if i == 1 or i == 5:
                driver.quit()  # end the old session before starting a new one
                driver = Firefox(options=options)
                # visit the page once to create cookies
                driver.get(url)
                time.sleep(20)
                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']
            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename), 'wb') as f:
                shutil.copyfileobj(r.raw, f)
            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename)
            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            i += 1  # count the attempt so the loop terminates
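        # after at most ten attempts, check whether the download succeeded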
        if local_filename.exists():
            new_downloaded.append(submission)
            print(f"Download => {local_filename.relative_to(root_path)}")
            # unzip data (only for new downloads)
            if local_filename.suffix == ".zip":
                try:
                    zipped_file = zipfile.ZipFile(str(local_filename), 'r')
                    zipped_file.extractall(str(local_filename.parent))
                    print(f"Extracted {len(zipped_file.namelist())} files.")
                    zipped_file.close()
                # TODO: better error logging/visibility
                except zipfile.BadZipFile:
                    print(f"Error while trying to extract "
                          f"{local_filename.relative_to(root_path)}")
                except NotImplementedError:
                    print("Zip format not supported, please unzip on the command line.")
            else:
                print(f"Not attempting to extract "
                      f"{local_filename.relative_to(root_path)}.")
        else:
            print(f"Failed to download {local_filename.relative_to(root_path)}")
    else:
        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
driver.close()
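# save a record of this run's new downloads alongside the downloaded data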
df = pd.DataFrame(new_downloaded)
df.to_csv(downloaded_data_path_UNFCCC
          / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)