download_ndc.py

import pandas as pd
import requests
import shutil
import time
import os
import re
from datetime import date
from random import randrange
from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
from pathlib import Path
  11. """
  12. based on download_bur from national-inventory-submissions
  13. # (https://github.com/openclimatedata/national-inventory-submisions)
  14. """

###############
#
# TODO
# download directly via selenium see link below
# https://sqa.stackexchange.com/questions/2197/
# how-to-download-a-file-using-seleniums-webdriver
###############
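
# a rough, untested sketch of the selenium approach from the TODO above,
# assuming Firefox/geckodriver and the selenium package are available; the
# preferences below configure automatic PDF downloads instead of the viewer:
#
#   from selenium import webdriver
#   options = webdriver.FirefoxOptions()
#   options.set_preference("browser.download.folderList", 2)
#   options.set_preference("browser.download.dir", str(downloaded_data_path_UNFCCC))
#   options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
#   options.set_preference("pdfjs.disabled", True)
#   driver = webdriver.Firefox(options=options)
#   driver.get(submission_page_url)  # hypothetical variable
#   # ... locate and click the document link, then driver.quit()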

# we use the ndc package provided by openclimatedata which is updated on
# a daily basis
submissions_url = "https://github.com/openclimatedata/ndcs/raw/main/data/ndcs.csv"
submissions = pd.read_csv(submissions_url)
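# columns used further down: Title, EncodedAbsUrl, SubmissionDate and Party
# (accessed as submission.<column> inside the loop)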

url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"

# if we get files of this size they are error pages and we need to
# try the download again
# TODO error page sizes are from BUR and NC and might differ for NDCs
# if an error page is found instead of a pdf adjust sizes here
error_file_sizes = [212, 210]

ndc_regex = r".*\s([A-Za-z]*)\sNDC"
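# assuming submission titles follow the pattern "<Party> <Ordinal> NDC"
# (e.g. "Argentina Second NDC"), the capture group picks up the word before
# "NDC"; a title that does not match leaves re.findall with an empty list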

# Ensure download path and subfolders exist
if not downloaded_data_path_UNFCCC.exists():
    downloaded_data_path_UNFCCC.mkdir(parents=True)

new_downloaded = []

for idx, submission in submissions.iterrows():
    print("=" * 60)
    #ndc = submission.Number
    title = submission.Title
    temp = re.findall(ndc_regex, title)
    ndc = temp[0]
    url = submission.EncodedAbsUrl
    submission_date = submission.SubmissionDate
    country = submission.Party
    country = country.replace(' ', '_')
    print(title)

    ndc_folder = "NDC_" + ndc + "_" + submission_date

    country_folder = downloaded_data_path_UNFCCC / country
    if not country_folder.exists():
        country_folder.mkdir()
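
    # two filename variants: the raw name from the URL and a version with
    # "%20" and spaces replaced by underscores, which is the one actually
    # written to disk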
    local_filename = country_folder / ndc_folder / url.split('/')[-1]
    local_filename_underscore = \
        downloaded_data_path_UNFCCC / country / ndc_folder / \
        url.split('/')[-1].replace("%20", "_").replace(" ", "_")
    if not local_filename.parent.exists():
        local_filename.parent.mkdir()

    # this should never be needed but in case anything goes wrong and
    # an error page is present it should be overwritten
    if local_filename_underscore.exists():
        # check file size. if 210 or 212 bytes it's the error page
        if Path(local_filename_underscore).stat().st_size in error_file_sizes:
            # found the error page. delete file
            os.remove(local_filename_underscore)

    # now error pages have been removed, so a file that is still present is a
    # real download and should not be overwritten
    if (not local_filename_underscore.exists()) \
            and (not local_filename_underscore.is_symlink()):
        i = 0  # retry counter
        while not local_filename_underscore.exists() and i < 10:
            r = requests.get(url, stream=True)
            with open(str(local_filename_underscore), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

            # check file size. if 210 or 212 bytes it's the error page
            if Path(local_filename_underscore).stat().st_size in error_file_sizes:
                # found the error page. delete file
                os.remove(local_filename_underscore)

            # sleep a bit to avoid running into captchas
            time.sleep(randrange(5, 15))
            i += 1  # count the attempt so the retry loop terminates

        if local_filename_underscore.exists():
            new_downloaded.append(submission)
            print("Download => downloaded_data/UNFCCC/" + country + "/" +
                  ndc_folder + "/" + local_filename_underscore.name)
        else:
            print("Failed downloading downloaded_data/UNFCCC/" + country + "/"
                  + ndc_folder + "/" + local_filename_underscore.name)
    else:
        print("=> Already downloaded " + local_filename_underscore.name)

df = pd.DataFrame(new_downloaded)
df.to_csv(
    downloaded_data_path_UNFCCC / "00_new_downloads_ndc-{}.csv".format(date.today()),
    index=False,
)