"""Download the NDC submissions listed in the openclimatedata ``ndcs`` dataset.

Every row of ``ndcs.csv`` is downloaded to
``downloaded_data_path_UNFCCC / <Party> / NDC_<Number>_<SubmissionDate>``.
Files that are already present are skipped. The rows downloaded in this run
are written to ``00_new_downloads_ndc-<today>.csv`` in the download folder.

based on download_bur from national-inventory-submissions
(https://github.com/openclimatedata/national-inventory-submissions)
"""
import os
import shutil
import time
from datetime import date
from pathlib import Path
from random import randrange

import pandas as pd
import requests

from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC

###############
#
# TODO
# download directly via selenium see link below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
###############

# we use the ndc package provided by openclimatedata which is updated on
# a daily basis
submissions_url = "https://github.com/openclimatedata/ndcs/raw/main/data/ndcs.csv"
submissions = pd.read_csv(submissions_url)

# NOTE: not used by the download loop below (each submission carries its
# own download link in the EncodedAbsUrl column)
url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"

# if we get files of this size they are error pages and we need to
# try the download again
# TODO error page sizes are from BUR and NC and might differ for NDCs
# if an error page is found instead of a pdf adjust sizes here
error_file_sizes = [212, 210]

# give up on a submission after this many attempts
max_download_attempts = 10

# seconds to wait for the server before a request counts as failed
request_timeout = 60


def _is_error_page(path):
    """Return True if the file at ``path`` has the size of a known error page."""
    return Path(path).stat().st_size in error_file_sizes


def _download_once(file_url, target):
    """Make one attempt to download ``file_url`` to ``target``.

    Parameters
    ----------
    file_url
        Address of the file to download.
    target
        ``pathlib.Path`` the file is written to.

    Returns
    -------
    bool
        True if a file that is not a known error page was stored at
        ``target``. False if the request failed or an error page was
        received; in both cases no file is left at ``target``.
    """
    try:
        response = requests.get(file_url, stream=True, timeout=request_timeout)
    except requests.exceptions.RequestException as err:
        print("Request failed for " + file_url + ": " + str(err))
        return False

    complete = False
    try:
        with response, open(str(target), 'wb') as f:
            shutil.copyfileobj(response.raw, f)
        complete = True
    finally:
        # never leave a truncated file behind: the next run would treat
        # it as an already downloaded submission
        if not complete and target.exists():
            target.unlink()

    # check file size. if 210 or 212 bytes it's the error page
    if _is_error_page(target):
        # found the error page. delete file
        os.remove(target)
        return False
    return True


# Ensure download path and subfolders exist
downloaded_data_path_UNFCCC.mkdir(parents=True, exist_ok=True)

new_downloaded = []

for _, submission in submissions.iterrows():
    print("=" * 60)
    title = submission.Title
    file_url = submission.EncodedAbsUrl
    country = submission.Party.replace(' ', '_')
    print(title)

    # str() keeps this working if pandas parsed the columns as non-strings
    ndc_folder = "NDC_" + str(submission.Number) + "_" + str(submission.SubmissionDate)

    # local file name: last part of the link with (encoded) spaces
    # replaced by underscores
    file_name = file_url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    target = downloaded_data_path_UNFCCC / country / ndc_folder / file_name
    target.parent.mkdir(parents=True, exist_ok=True)

    # this should never be needed but in case anything goes wrong and
    # an error page is present it should be overwritten
    if target.exists() and _is_error_page(target):
        os.remove(target)

    # error pages have been removed, so a present file must not be
    # overwritten. The symlink test also covers links whose target is
    # missing (exists() is False for those)
    if target.exists() or target.is_symlink():
        print("=> Already downloaded " + target.name)
        continue

    for _attempt in range(max_download_attempts):
        stored = _download_once(file_url, target)
        # sleep a bit to avoid running into captchas
        time.sleep(randrange(5, 15))
        if stored:
            break

    if target.exists():
        new_downloaded.append(submission)
        print("Download => downloaded_data/UNFCCC/" + country + "/" +
              ndc_folder + "/" + target.name)
    else:
        print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
              + ndc_folder + "/" + target.name)

df = pd.DataFrame(new_downloaded)
df.to_csv(
    downloaded_data_path_UNFCCC
    / "00_new_downloads_ndc-{}.csv".format(date.today()),
    index=False,
)