download_ndc.py

import pandas as pd
import requests
import shutil
import time
import os
from datetime import date
from random import randrange
from pathlib import Path

root = Path(__file__).parents[2]
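# NOTE: the script is assumed to sit two directory levels below the
# repository root (hence parents[2]); all downloads end up under
# <repo>/downloaded_data/UNFCCC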
  10. """
  11. based on download_bur from national-inventory-submissions
  12. # (https://github.com/openclimatedata/national-inventory-submisions)
  13. """
###############
#
# TODO
# download directly via selenium, see link below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
###############

# we use the ndc data package provided by openclimatedata which is
# updated on a daily basis
submissions_url = "https://github.com/openclimatedata/ndcs/raw/main/data/ndcs.csv"
submissions = pd.read_csv(submissions_url)
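# columns of ndcs.csv used below: Party, Number, Title, EncodedAbsUrl,
# SubmissionDate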

# NOTE: this url is only a default; it is overwritten with each
# submission's EncodedAbsUrl inside the loop below
url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"

# if we get files of this size they are error pages and we need to
# try the download again
# TODO error page sizes are from BUR and NC and might differ for NDCs
# if an error page is found instead of a pdf adjust sizes here
error_file_sizes = [212, 210]

# Ensure download path and subfolders exist
download_path = root / "downloaded_data" / "UNFCCC"
if not download_path.exists():
    download_path.mkdir(parents=True)

# rows of the submissions table downloaded in this run; written to a
# summary csv at the end
new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    ndc = submission.Number
    title = submission.Title
    url = submission.EncodedAbsUrl
    submission_date = submission.SubmissionDate
    country = submission.Party
    # folder names use underscores instead of spaces
    country = country.replace(' ', '_')
    print(title)

    ndc_folder = "NDC_" + ndc + "_" + submission_date

    country_folder = download_path / country
    if not country_folder.exists():
        country_folder.mkdir()

    local_filename = country_folder / ndc_folder / url.split('/')[-1]
    local_filename_underscore = \
        download_path / country / ndc_folder / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
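    # both filename variants resolve to the same parent folder; the
    # underscore variant is the name actually written to disk, while
    # local_filename is only needed to create that folder below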
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    # this should never be needed, but in case anything went wrong and
    # an error page is present it should be overwritten
    if local_filename_underscore.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename_underscore)

    # now that error pages have been removed, an existing file is a real
    # download and should not be overwritten
    if not local_filename_underscore.exists():
        i = 0  # reset retry counter
        while not local_filename_underscore.exists() and i < 10:
            r = requests.get(url, stream=True)
            with open(str(local_filename_underscore), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename_underscore)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            i += 1  # count the attempt so the loop terminates after 10 tries
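        # after at most 10 attempts, check whether the download succeeded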
        if local_filename_underscore.exists():
            new_downloaded.append(submission)
            print("Download => downloaded_data/UNFCCC/" + country + "/" +
                  ndc_folder + "/" + local_filename_underscore.name)
        else:
            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
                  + ndc_folder + "/" + local_filename_underscore.name)
    else:
        print("=> Already downloaded " + local_filename_underscore.name)

# write a summary csv of everything newly downloaded in this run
df = pd.DataFrame(new_downloaded)
df.to_csv(download_path / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)
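
# usage sketch (assuming the repository layout described at the top):
#   python download_ndc.py
# the summary of new downloads lands next to the data as
# 00_new_downloads_ndc-<today's date>.csv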