download_btr.py

import argparse
import os
import shutil
import time
import zipfile
from datetime import date
from pathlib import Path
from random import randrange

import pandas as pd
import requests
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
from unfccc_submission_info import get_BTR_name_and_URL
###############
#
# TODO
# Download directly via selenium, see
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
# For automatic downloading see https://stackoverflow.com/questions/70740163/
# python-selenium-firefox-driver-dismiss-open-save-file-popup
###############
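
# A possible implementation of the TODO above (untested sketch, not part of
# the current workflow): configure the Firefox profile so that known MIME
# types are saved to disk without the open/save dialog. The preference names
# are standard Firefox settings; 'download_dir' is a hypothetical variable.
#
# options = Options()
# options.set_preference('browser.download.folderList', 2)  # 2 = custom directory
# options.set_preference('browser.download.dir', str(download_dir))
# options.set_preference('browser.helperApps.neverAsk.saveToDisk',
#                        'application/pdf,application/zip,application/octet-stream')
# options.set_preference('pdfjs.disabled', True)  # skip the in-browser PDF viewer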
descr = 'Download and unzip data from UNFCCC Biennial Transparency Reports Submissions. ' \
        'Based on download.py from national-inventory-submissions ' \
        '(https://github.com/openclimatedata/national-inventory-submisions)'
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--round',
    help='Submission round to download, e.g. 1'
)
args = parser.parse_args()
submission_round = int(args.round)
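
# Example invocation:
#   python download_btr.py --round 1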
round_name, url = get_BTR_name_and_URL(submission_round)
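# get_BTR_name_and_URL (imported above) maps the round number to the round's
# name and its UNFCCC submission page URL, e.g. roughly
# 1 -> ('first', <URL of the BTR1 submission page>) (illustrative only; the
# actual values live in unfccc_submission_info).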
dataset = f"BTR{submission_round}"
print(f"Downloading data for {round_name} BTRs")
# UNFCCC error pages are 210 or 212 bytes, so these sizes mark failed downloads
error_file_sizes = [212, 210]

# Read submissions list
submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-{dataset}.csv")
# set options for headless mode
profile_path = ".firefox"
options = Options()
#options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)
options.set_preference('browser.download.folderList', 2)

# set up selenium driver
driver = Firefox(options=options)
# visit the main data page once to create cookies
driver.get(url)
# wait a bit for the website to load before we get the cookies
time.sleep(20)

# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']
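
# The selenium-to-requests cookie conversion above is repeated inside the
# retry loop below; a small helper could remove the duplication (illustrative
# refactor, not part of the original script):
#
# def get_cookies_dict(driver):
#     return {c['name']: c['value'] for c in driver.get_cookies()}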
new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(f"Downloading {title} from {url}")

    country_folder = downloaded_data_path_UNFCCC / country
    if not country_folder.exists():
        country_folder.mkdir()
    local_filename = \
        country_folder / dataset / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    if local_filename.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename)

    # now we have removed error pages, so a present file should not be overwritten
    if (not local_filename.exists()) and (not local_filename.is_symlink()):
        i = 0  # reset counter
        while not local_filename.exists() and i < 10:
            # on attempts i == 1 and i == 5, restart the driver to get a new session ID
            if i == 1 or i == 5:
                driver = Firefox(options=options)
                # visit the page again to refresh the session cookies
                driver.get(url)
                time.sleep(20)
                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']

            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            i += 1  # advance the counter so the retry loop can terminate

        if local_filename.exists():
            new_downloaded.append(submission)
            print(f"Download => {local_filename.relative_to(root_path)}")

            # unzip data (only for new downloads)
            if local_filename.suffix == ".zip":
                try:
                    zipped_file = zipfile.ZipFile(str(local_filename), 'r')
                    zipped_file.extractall(str(local_filename.parent))
                    print(f"Extracted {len(zipped_file.namelist())} files.")
                    zipped_file.close()
                # TODO Better error logging/visibility
                except zipfile.BadZipFile:
                    print(f"Error while trying to extract "
                          f"{local_filename.relative_to(root_path)}")
                except NotImplementedError:
                    print("Zip format not supported, please unzip on the command line.")
            else:
                print(f"Not attempting to extract "
                      f"{local_filename.relative_to(root_path)}.")
        else:
            print(f"Failed to download {local_filename.relative_to(root_path)}")
    else:
        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")

driver.close()

df = pd.DataFrame(new_downloaded)
df.to_csv(downloaded_data_path_UNFCCC
          / f"00_new_downloads_{dataset}-{date.today()}.csv", index=False)