download_bur.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. import pandas as pd
  2. import requests
  3. import shutil
  4. import time
  5. import os
  6. from datetime import date
  7. from selenium import webdriver
  8. from random import randrange
  9. from pathlib import Path
  10. root = Path(__file__).parents[2]
  11. """
  12. based on download_bur from national-inventory-submissions
  13. # (https://github.com/openclimatedata/national-inventory-submisions)
  14. """
  15. ###############
  16. #
  17. # TODO
  18. # download directly via selenium see link below
  19. # https://sqa.stackexchange.com/questions/2197/
  20. # how-to-download-a-file-using-seleniums-webdriver
  21. ###############
  22. submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" /
  23. "submissions-bur.csv")
  24. url = "https://unfccc.int/BURs"
  25. # if we get files of this size they are error pages and we need to
  26. # try the download again
  27. error_file_sizes = [212, 210]
  28. # find which BUR submission rounds exist
  29. present_BURs = submissions.Kind.unique()
  30. # Ensure download path and subfolders exist
  31. download_path = root / "downloaded_data/UNFCCC"
  32. if not download_path.exists():
  33. download_path.mkdir(parents=True)
  34. # set options for headless mode
  35. options = webdriver.firefox.options.Options()
  36. # options.add_argument('-headless')
  37. # create profile for headless mode
  38. profile = webdriver.FirefoxProfile()
  39. profile.set_preference('browser.download.folderList', 2)
  40. # set up selenium driver
  41. driver = webdriver.Firefox(options=options, firefox_profile=profile)
  42. # visit the main data page once to create cookies
  43. driver.get(url)
  44. time.sleep(20)
  45. # get the session id cookie
  46. cookies_selenium = driver.get_cookies()
  47. cookies = {}
  48. for cookie in cookies_selenium:
  49. cookies[cookie['name']] = cookie['value']
  50. print(cookies)
  51. new_downloaded = []
  52. for idx, submission in submissions.iterrows():
  53. print("=" * 60)
  54. bur = submission.Kind
  55. title = submission.Title
  56. url = submission.URL
  57. country = submission.Country
  58. country = country.replace(' ', '_')
  59. print(title)
  60. country_folder = download_path / country
  61. if not country_folder.exists():
  62. country_folder.mkdir()
  63. local_filename = country_folder / bur / url.split('/')[-1]
  64. local_filename_underscore = \
  65. download_path / country / bur / \
  66. url.split('/')[-1].replace("%20", "_").replace(" ", "_")
  67. if not local_filename.parent.exists():
  68. local_filename.parent.mkdir()
  69. ### remove, not needed as no legacy data present
  70. #if local_filename.exists():
  71. # # rename
  72. # local_filename.rename(local_filename_underscore)
  73. # print("Renamed " + bur + "/" + country + "/" + local_filename.name)
  74. # this should never be needed but in case anything goes wrong and
  75. # an error page is present it should be overwritten
  76. if local_filename_underscore.exists():
  77. # check file size. if 210 or 212 bytes it's the error page
  78. if Path(local_filename_underscore).stat().st_size in error_file_sizes:
  79. # found the error page. delete file
  80. os.remove(local_filename_underscore)
  81. # now we have remove error pages, so a present file should not be overwritten
  82. if not local_filename_underscore.exists():
  83. i = 0 # reset counter
  84. while not local_filename_underscore.exists() and i < 10:
  85. # for i = 0 and i = 5 try to get a new session ID
  86. if i == 1 or i == 5:
  87. driver = webdriver.Firefox(options=options,
  88. firefox_profile=profile)
  89. # visit the main data page once to create cookies
  90. driver.get(url)
  91. time.sleep(20)
  92. # get the session id cookie
  93. cookies_selenium = driver.get_cookies()
  94. cookies = {}
  95. for cookie in cookies_selenium:
  96. cookies[cookie['name']] = cookie['value']
  97. r = requests.get(url, stream=True, cookies=cookies)
  98. with open(str(local_filename_underscore), 'wb') as f:
  99. shutil.copyfileobj(r.raw, f)
  100. # check file size. if 210 or 212 bytes it's the error page
  101. if Path(local_filename_underscore).stat().st_size in error_file_sizes:
  102. # found the error page. delete file
  103. os.remove(local_filename_underscore)
  104. # sleep a bit to avoid running into captchas
  105. time.sleep(randrange(5, 15))
  106. if local_filename_underscore.exists():
  107. new_downloaded.append(submission)
  108. print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
  109. "/" + local_filename_underscore.name)
  110. else:
  111. print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
  112. + bur + "/" + local_filename_underscore.name)
  113. else:
  114. print("=> Already downloaded " + local_filename_underscore.name)
  115. driver.close()
  116. df = pd.DataFrame(new_downloaded)
  117. df.to_csv(download_path / "00_new_downloads_bur-{}.csv".format(date.today()), index=False)