download_non-annexI.py

import argparse
import pandas as pd
import requests
import shutil
import time
import os
from datetime import date
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange
from pathlib import Path
from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
###############
#
# TODO
# download directly via selenium, see link below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
# for automatic downloading see https://stackoverflow.com/questions/70740163/
# python-selenium-firefox-driver-dismiss-open-save-file-popup
###############
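
# A possible implementation of the TODO above (untested sketch, based on the
# linked answers): tell Firefox to save files without showing the open/save
# dialog. The preference names are standard Firefox settings; the download
# directory used here is only a placeholder.
# options.set_preference('browser.download.dir', str(downloaded_data_path_UNFCCC))
# options.set_preference('browser.helperApps.neverAsk.saveToDisk',
#                        'application/pdf')
# options.set_preference('pdfjs.disabled', True)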
descr = 'Download data from UNFCCC non-AnnexI Submissions. ' \
        'Based on download_bur.py from national-inventory-submissions ' \
        '(https://github.com/openclimatedata/national-inventory-submisions)'
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--category',
    help='Category to download: BUR, NC'
)
args = parser.parse_args()
category = args.category.upper()
print(f"Downloading {category} submissions")
if category == "BUR":
    url = "https://unfccc.int/BURs"
else:
    url = "https://unfccc.int/non-annex-I-NCs"

# if we get files of this size they are error pages and we need to
# try the download again
error_file_sizes = [212, 210]

# Read submissions list
submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-{category.lower()}.csv")
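# the list is expected to provide at least the columns 'Kind', 'Title',
# 'URL' and 'Country', which are used in the download loop below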
# set options for headless mode
profile_path = ".firefox"
options = Options()
#options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)
options.set_preference('browser.download.folderList', 2)
# set up selenium driver
driver = Firefox(options=options)
# visit the main data page once to create cookies
driver.get(url)

# wait a bit for the website to load before we get the cookies
time.sleep(20)

# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']
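# the same as a dict comprehension, if preferred:
# cookies = {c['name']: c['value'] for c in cookies_selenium}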
new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    kind = submission.Kind
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(f"Downloading {title} from {url}")

    country_folder = downloaded_data_path_UNFCCC / country
    if not country_folder.exists():
        country_folder.mkdir()
    local_filename = \
        country_folder / kind / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    if local_filename.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename)

    # now we have removed error pages, so a present file should not be overwritten
    if (not local_filename.exists()) and (not local_filename.is_symlink()):
        i = 0  # reset counter
        while not local_filename.exists() and i < 10:
            # for i = 1 and i = 5 try to get a new session ID
            if i == 1 or i == 5:
                driver = Firefox(options=options)

                # visit the main data page once to create cookies
                driver.get(url)
                time.sleep(20)

                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']

            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            i += 1  # increment the counter so the loop ends after 10 tries

        if local_filename.exists():
            new_downloaded.append(submission)
            print(f"Download => {local_filename.relative_to(root_path)}")
        else:
            print(f"Failed to download {local_filename.relative_to(root_path)}")
    else:
        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")

driver.close()
df = pd.DataFrame(new_downloaded)
df.to_csv(downloaded_data_path_UNFCCC /
          f"00_new_downloads_{category}-{date.today()}.csv", index=False)
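
# usage example (assuming the script is run from an environment where the
# UNFCCC_GHG_data package is importable):
#   python download_non-annexI.py --category BUR
#   python download_non-annexI.py --category NC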