# download.py (4.8 KB)
  1. """Downloads data from FAOSTAT website."""
  2. import time
  3. import zipfile
  4. from datetime import datetime
  5. import datalad.api
  6. from bs4 import BeautifulSoup
  7. from helper.definitions import downloaded_data_path, root_path
  8. from selenium import webdriver
  9. from selenium.webdriver.chrome.service import Service
  10. class DateTagNotFoundError(Exception):
  11. """
  12. The date when the data set was last updated could not be found
  13. """
  14. def __init__(
  15. self, message="The <p> tag with data-role='date' was not found on the page."
  16. ):
  17. super().__init__(message)
  18. if __name__ == "__main__":
  19. sources = [
  20. (
  21. "farm_gate_emissions_crops",
  22. "https://www.fao.org/faostat/en/#data/GCE",
  23. "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
  24. ),
  25. (
  26. "farm_gate_livestock",
  27. "https://www.fao.org/faostat/en/#data/GLE",
  28. "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
  29. ),
  30. (
  31. "farm_gate_agriculture_energy",
  32. "https://www.fao.org/faostat/en/#data/GN",
  33. "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
  34. ),
  35. (
  36. "land_use_forests",
  37. "https://www.fao.org/faostat/en/#data/GF",
  38. "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
  39. ),
  40. (
  41. "land_use_fires",
  42. "https://www.fao.org/faostat/en/#data/GI",
  43. "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
  44. ),
  45. (
  46. "land_use_drained_organic_soils",
  47. "https://www.fao.org/faostat/en/#data/GV",
  48. "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
  49. ),
  50. (
  51. "pre_post_agricultural_production",
  52. "https://www.fao.org/faostat/en/#data/GPP",
  53. "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
  54. ),
  55. ]
  56. for (
  57. ds_name,
  58. url,
  59. url_download,
  60. ) in sources:
  61. # If the driver isn't found on your system PATH, Selenium
  62. # will automatically download it for you. Make sure there is no
  63. # chromedriver installed on your system
  64. service = Service()
  65. driver = webdriver.Chrome(service=service)
  66. driver.get(url)
  67. # give time to load javascript
  68. time.sleep(3)
  69. html_content = driver.page_source
  70. soup = BeautifulSoup(html_content, "html.parser")
  71. date_tag = soup.find("p", {"data-role": "date"})
  72. if not date_tag:
  73. msg = "The <p> tag with data-role='date' was not found on the page."
  74. raise DateTagNotFoundError(msg)
  75. last_updated = date_tag.get_text()
  76. # make downloaded_data folder if it doesn't exist yet
  77. if not downloaded_data_path.exists():
  78. downloaded_data_path.mkdir()
  79. # make data set folder if it doesn't exist yet
  80. ds_path = downloaded_data_path / ds_name
  81. if not ds_path.exists():
  82. ds_path.mkdir()
  83. # create unique directory
  84. last_updated_iso = datetime.strptime(last_updated, "%B %d, %Y").strftime(
  85. "%Y-%m-%d"
  86. )
  87. local_data_dir = ds_path / last_updated_iso
  88. if not local_data_dir.exists():
  89. local_data_dir.mkdir()
  90. # download and commit with datalad
  91. local_filename = local_data_dir / f"{ds_name}.zip"
  92. datalad.api.download_url(
  93. urls=url_download,
  94. message=f"Added {ds_name}",
  95. path=str(local_filename),
  96. )
  97. if local_filename.exists():
  98. print(f"Download => {local_filename.relative_to(root_path)}")
  99. # unzip data (only for new downloads)
  100. if local_filename.suffix == ".zip":
  101. try:
  102. zipped_file = zipfile.ZipFile(str(local_filename), "r")
  103. zipped_file.extractall(str(local_filename.parent))
  104. print(f"Extracted {len(zipped_file.namelist())} files.")
  105. zipped_file.close()
  106. # os.remove(local_filename)
  107. # TODO Better error logging/visibilty
  108. except zipfile.BadZipFile:
  109. print(
  110. f"Error while trying to extract "
  111. f"{local_filename.relative_to(root_path)}"
  112. )
  113. except NotImplementedError:
  114. print(
  115. "Zip format not supported, " "please unzip on the command line."
  116. )
  117. else:
  118. print(
  119. f"Not attempting to extract "
  120. f"{local_filename.relative_to(root_path)}."
  121. )