download_bur.py

import pandas as pd
import requests
import shutil
import time
import os
from datetime import date
from selenium import webdriver
from random import randrange
from pathlib import Path

# repository root: two levels above the folder containing this script
root = Path(__file__).parents[2]
  11. """
  12. based on download_bur from national-inventory-submissions
  13. # (https://github.com/openclimatedata/national-inventory-submisions)
  14. """
  15. ###############
  16. #
  17. # TODO
  18. # download directly via selenium see link below
  19. # https://sqa.stackexchange.com/questions/2197/
  20. # how-to-download-a-file-using-seleniums-webdriver
  21. ###############
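
# a possible sketch for the TODO above (an assumption, not part of the current
# workflow and untested here): Firefox can be told to save PDFs straight to disk
# so that selenium alone could handle the download. These preferences would be
# set on the FirefoxProfile created further below; the MIME type and the use of
# download_path as target directory are only illustrative.
# profile.set_preference('browser.download.dir', str(download_path))
# profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/pdf')
# profile.set_preference('pdfjs.disabled', True)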

submissions = pd.read_csv(root / "downloaded_data" / "UNFCCC" / "submissions-bur.csv")

# use the CRF data url, for some reason visiting the BUR url
# is not enough to generate the necessary cookies
url = "https://unfccc.int/BURs"

# if we get files of this size they are error pages and we need to
# try the download again
error_file_sizes = [212, 210]

# find which BUR submission rounds exist
present_BURs = submissions.Kind.unique()

# Ensure download path and subfolders exist
download_path = root / "downloaded_data" / "UNFCCC"
if not download_path.exists():
    download_path.mkdir(parents=True)
for BUR in present_BURs:
    download_path_BUR = download_path / BUR
    if not download_path_BUR.exists():
        download_path_BUR.mkdir(parents=True)

# set options for headless mode
options = webdriver.firefox.options.Options()
# options.add_argument('-headless')

# create profile for headless mode
profile = webdriver.FirefoxProfile()
# folderList = 2: use a custom download directory (0 = desktop, 1 = downloads folder)
profile.set_preference('browser.download.folderList', 2)

# set up selenium driver
driver = webdriver.Firefox(options=options, firefox_profile=profile)
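# note: passing firefox_profile to webdriver.Firefox is the selenium 3 style;
# with selenium 4 the profile would be attached via options.profile instead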

# visit the main data page once to create cookies
driver.get(url)
time.sleep(20)

# get the session id cookie
cookies_selenium = driver.get_cookies()
cookies = {}
for cookie in cookies_selenium:
    cookies[cookie['name']] = cookie['value']

print(cookies)
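
# the cookie dict built above is handed to requests.get in the loop below so
# that the file downloads reuse the session established by the selenium visit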

new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    bur = submission.Kind
    title = submission.Title
    url = submission.URL
    country = submission.Country
    country = country.replace(' ', '_')
    print(title)

    local_filename = download_path / country / bur / url.split('/')[-1]
    local_filename_underscore = \
        download_path / country / bur / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        # parents=True so the country level is created as well
        local_filename.parent.mkdir(parents=True)

    ### remove, not needed as no legacy data present
    #if local_filename.exists():
    #    # rename
    #    local_filename.rename(local_filename_underscore)
    #    print("Renamed " + bur + "/" + country + "/" + local_filename.name)

    # this should never be needed but in case anything goes wrong and
    # an error page is present it should be overwritten
    if local_filename_underscore.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename_underscore)

    # error pages have now been removed, so a file that is still present
    # should not be overwritten
    if not local_filename_underscore.exists():
        i = 0  # reset counter
        while not local_filename_underscore.exists() and i < 10:
            # on the second and sixth attempt get a new session ID
            if i == 1 or i == 5:
                driver = webdriver.Firefox(options=options, firefox_profile=profile)

                # visit the main data page once to create cookies
                driver.get(url)
                time.sleep(20)

                # get the session id cookie
                cookies_selenium = driver.get_cookies()
                cookies = {}
                for cookie in cookies_selenium:
                    cookies[cookie['name']] = cookie['value']

            r = requests.get(url, stream=True, cookies=cookies)
            with open(str(local_filename_underscore), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename_underscore)
            i += 1  # count the attempt so the loop terminates after 10 tries

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))

        if local_filename_underscore.exists():
            new_downloaded.append(submission)
            print("Download => downloaded_data/UNFCCC/" + country + "/" + bur +
                  "/" + local_filename_underscore.name)
        else:
            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
                  + bur + "/" + local_filename_underscore.name)
    else:
        print("=> Already downloaded " + local_filename_underscore.name)

driver.close()
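
# record which submissions were newly downloaded in this run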
df = pd.DataFrame(new_downloaded)
df.to_csv(download_path / "00_new_downloads-{}.csv".format(date.today()), index=False)