download_non-annexI.py

import argparse
import pandas as pd
import requests
import shutil
import time
import os
from datetime import date
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from random import randrange
from pathlib import Path

root = Path(__file__).parents[2]
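# usage (the script is expected to sit two directories below the repository
# root, matching the parents[2] lookup above):
#   python download_non-annexI.py --category BUR
# "NC" downloads the non-AnnexI national communications instead of BURs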
###############
#
# TODO
# download directly via selenium, see
# https://sqa.stackexchange.com/questions/2197/how-to-download-a-file-using-seleniums-webdriver
# for automatic downloading see
# https://stackoverflow.com/questions/70740163/python-selenium-firefox-driver-dismiss-open-save-file-popup
###############
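# A possible direction for the TODO above (an untested sketch, not called
# anywhere yet): configure the Firefox profile so files are saved straight
# to a custom directory without the open/save dialog, which would let
# selenium handle the downloads directly. The preference keys are standard
# Firefox about:config names; the MIME type list is an assumption and may
# need extending for non-PDF submissions.
def _selenium_download_options(download_dir):
    opts = Options()
    opts.add_argument('-headless')
    # 2 = use the custom directory given in browser.download.dir
    opts.set_preference('browser.download.folderList', 2)
    opts.set_preference('browser.download.dir', str(download_dir))
    # save these MIME types without asking
    opts.set_preference('browser.helperApps.neverAsk.saveToDisk',
                        'application/pdf')
    # don't open PDFs in the built-in viewer instead of downloading
    opts.set_preference('pdfjs.disabled', True)
    return opts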
descr = 'Download data from UNFCCC non-AnnexI Submissions. ' \
        'Based on download_bur.py from national-inventory-submissions ' \
        '(https://github.com/openclimatedata/national-inventory-submisions)'
parser = argparse.ArgumentParser(description=descr)
parser.add_argument(
    '--category',
    help='Category to download: BUR, NC'
)
args = parser.parse_args()
category = args.category.upper()
print(f"Downloading {category} submissions")
if category == "BUR":
    url = "https://unfccc.int/BURs"
else:
    url = "https://unfccc.int/non-annex-I-NCs"
# if we get files of this size they are error pages and we need to
# try the download again
error_file_sizes = [212, 210]

# Read submissions list
download_path = root / "downloaded_data" / "UNFCCC"
submissions = pd.read_csv(download_path / f"submissions-{category.lower()}.csv")
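# the submissions list is expected to provide at least the columns
# "Kind", "Title", "URL", and "Country", which are read per row below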
# set options for headless mode
profile_path = ".firefox"
options = Options()
#options.add_argument('-headless')

# create profile for headless mode and automatic downloading
options.set_preference('profile', profile_path)
options.set_preference('browser.download.folderList', 2)
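# note: folderList=2 points downloads at a custom directory, which would
# also need browser.download.dir to be set (see the sketch above); for now
# the actual file transfer happens via requests further down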
# set up selenium driver
driver = Firefox(options=options)
# visit the main data page once to create cookies
driver.get(url)
# wait a bit for the website to load before we get the cookies
time.sleep(20)

# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']
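# requests accepts a plain name -> value dict as cookies, so the session
# established in the real browser can be reused for the plain HTTP
# downloads below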
new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    kind = submission.Kind
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(f"Downloading {title} from {url}")

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()
    local_filename = \
        country_folder / kind / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()
    if local_filename.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename)

    # now we have removed error pages, so a present file should not be overwritten
    if not local_filename.exists():
        i = 0  # reset counter
        while not local_filename.exists() and i < 10:
            # for i = 1 and i = 5 try to get a new session ID
            if i == 1 or i == 5:
                # end the old session before starting a fresh one
                driver.quit()
                driver = Firefox(options=options)

                # revisit the submission page to create fresh cookies
                driver.get(url)
                time.sleep(20)

                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']

            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            # count the attempt so the loop gives up after ten tries
            i += 1
        if local_filename.exists():
            new_downloaded.append(submission)
            print(f"Download => {local_filename.relative_to(root)}")
        else:
            print(f"Failed to download {local_filename.relative_to(root)}")
    else:
        print(f"=> Already downloaded {local_filename.relative_to(root)}")
# quit() rather than close() so the geckodriver process ends with the session
driver.quit()

df = pd.DataFrame(new_downloaded)
df.to_csv(download_path / f"00_new_downloads_{category}-{date.today()}.csv", index=False)
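# the CSV above only lists submissions fetched in this run; the date suffix
# in the filename keeps the records of separate runs apart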