download_annexI.py

import argparse
import os
import shutil
import time
import zipfile
from datetime import date
from pathlib import Path
from random import randrange

import pandas as pd
import requests
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

root = Path(__file__).parents[2]
###############
#
# TODO
# download directly via selenium, see link below:
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
# for automatic downloading see https://stackoverflow.com/questions/70740163/
# python-selenium-firefox-driver-dismiss-open-save-file-popup
###############
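# A possible sketch for the TODO above (not active; the preference names are
# standard Firefox settings, but the directory and MIME-type values here are
# assumptions): configure the profile so downloads are saved without the
# open/save dialog, then fetch the file url directly with driver.get().
#
# options = Options()
# options.set_preference('browser.download.folderList', 2)  # 2 = custom dir
# options.set_preference('browser.download.dir', '/tmp/unfccc_downloads')
# options.set_preference('browser.helperApps.neverAsk.saveToDisk',
#                        'application/zip,application/octet-stream')
# driver = Firefox(options=options)
# driver.get(file_url)  # file_url: placeholder for a submission file url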
descr = 'Download and unzip data from UNFCCC National Inventory Submissions. ' \
        'Based on download.py from national-inventory-submissions ' \
        '(https://github.com/openclimatedata/national-inventory-submisions)'
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--category',
    help='Category to download: CRF, NIR, SEF'
)
parser.add_argument(
    '--year',
    help='Year to download'
)
args = parser.parse_args()
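# example invocation (illustrative values):
#   python download_annexI.py --category CRF --year 2023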
year = args.year
category = args.category.upper()
dataset = category + year
print(f"Downloading data for {dataset}")
# generate the correct url
# TODO: move to utils as it is used in two places
if int(year) == 2019:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )
elif int(year) in range(2020, 2023):
    url = "https://unfccc.int/ghg-inventories-annex-i-parties/{}".format(year)
elif int(year) >= 2023:
    url = (
        "https://unfccc.int/process-and-meetings/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "national-inventory-submissions-{}".format(year)
    )
else:
    url = (
        "https://unfccc.int/process/transparency-and-reporting/"
        "reporting-and-review-under-the-convention/"
        "greenhouse-gas-inventories-annex-i-parties/"
        "submissions/national-inventory-submissions-{}".format(year)
    )
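# for example, year 2023 resolves to:
# https://unfccc.int/process-and-meetings/transparency-and-reporting/reporting-and-review-under-the-convention/greenhouse-gas-inventories-annex-i-parties/national-inventory-submissions-2023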
download_path = root / "downloaded_data" / "UNFCCC"
# if a downloaded file has one of these sizes (in bytes), it is the error page
error_file_sizes = [212, 210]
# read the submissions list
submissions = pd.read_csv(download_path / f"submissions-annexI_{year}.csv")
# filter the submissions list for the given category
items = submissions[submissions.Kind == category]
# set options for headless mode
profile_path = ".firefox"
options = Options()
# options.add_argument('-headless')
# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)
options.set_preference('browser.download.folderList', 2)
# set up the selenium driver
driver = Firefox(options=options)
# visit the main data page once to create cookies
driver.get(url)
# wait a bit for the website to load before we get the cookies
time.sleep(20)
# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']
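# note: the loop above is equivalent to the dict comprehension
# cookies = {c['name']: c['value'] for c in cookies_selenium}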
new_downloaded = []
for idx, submission in items.iterrows():
    print("=" * 60)
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(f"Downloading {title} from {url}")

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()
    local_filename = \
        country_folder / dataset / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()
    if local_filename.exists():
        # check the file size: if it is 210 or 212 bytes it's the error page
        if Path(local_filename).stat().st_size in error_file_sizes:
            # found the error page, delete the file
            os.remove(local_filename)

    # now that error pages have been removed, a file that is still
    # present is a real download and should not be overwritten
    if not local_filename.exists():
        i = 0  # reset counter
        while not local_filename.exists() and i < 10:
            # for i = 1 and i = 5 try to get a new session id
            if i == 1 or i == 5:
                driver = Firefox(options=options)
                # visit the page again to refresh the session cookies
                driver.get(url)
                time.sleep(20)
                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']

            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check the file size: if it is 210 or 212 bytes it's the error page
            if Path(local_filename).stat().st_size in error_file_sizes:
                # found the error page, delete the file and try again
                os.remove(local_filename)
            i += 1  # count attempts so the loop terminates after 10 tries

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
        if local_filename.exists():
            new_downloaded.append(submission)
            print(f"Download => {local_filename.relative_to(root)}")
            # unzip data (only for new downloads)
            if local_filename.suffix == ".zip":
                try:
                    zipped_file = zipfile.ZipFile(str(local_filename), 'r')
                    zipped_file.extractall(str(local_filename.parent))
                    print(f"Extracted {len(zipped_file.namelist())} files.")
                    zipped_file.close()
                # TODO: better error logging/visibility
                except zipfile.BadZipFile:
                    print(f"Error while trying to extract "
                          f"{local_filename.relative_to(root)}")
                except NotImplementedError:
                    print("Zip format not supported, "
                          "please unzip on the command line.")
            else:
                print(f"Not attempting to extract "
                      f"{local_filename.relative_to(root)}.")
        else:
            print(f"Failed to download {local_filename.relative_to(root)}")
    else:
        print(f"=> Already downloaded {local_filename.relative_to(root)}")
driver.close()

df = pd.DataFrame(new_downloaded)
df.to_csv(
    download_path / f"00_new_downloads_{category}{year}-{date.today()}.csv",
    index=False,
)