download_all_domains.py

  1. """Downloads all domain data sets from FAOSTAT website."""
  2. import zipfile
  3. import requests
  4. from src.faostat_data_primap.download import get_html_content, get_last_updated_date
  5. from src.faostat_data_primap.helper.definitions import downloaded_data_path, root_path
  6. if __name__ == "__main__":
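    # Each tuple: (dataset name used for the local folder,
    # FAOSTAT domain page that carries the release date,
    # bulk download URL for the full data set).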
    sources = [
        (
            "farm_gate_emissions_crops",
            "https://www.fao.org/faostat/en/#data/GCE",
            "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
        ),
        (
            "farm_gate_livestock",
            "https://www.fao.org/faostat/en/#data/GLE",
            "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
        ),
        (
            "farm_gate_agriculture_energy",
            "https://www.fao.org/faostat/en/#data/GN",
            "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
        ),
        (
            "land_use_forests",
            "https://www.fao.org/faostat/en/#data/GF",
            "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
        ),
        (
            "land_use_fires",
            "https://www.fao.org/faostat/en/#data/GI",
            "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
        ),
        (
            "land_use_drained_organic_soils",
            "https://www.fao.org/faostat/en/#data/GV",
            "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
        ),
        (
            "pre_post_agricultural_production",
            "https://www.fao.org/faostat/en/#data/GPP",
            "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
        ),
    ]

    for (
        ds_name,
        url,
        url_download,
    ) in sources:
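        # Scrape the domain page for its "last updated" date; each release
        # is stored in a sub-directory named after that date.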
        soup = get_html_content(url)
        last_updated = get_last_updated_date(soup, url)

        if not downloaded_data_path.exists():
            downloaded_data_path.mkdir()

        ds_path = downloaded_data_path / ds_name
        if not ds_path.exists():
            ds_path.mkdir()

        local_data_dir = ds_path / last_updated
        if not local_data_dir.exists():
            local_data_dir.mkdir()

        local_filename = local_data_dir / f"{ds_name}.zip"
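
        # Fetch the bulk archive; raise_for_status() aborts on HTTP errors.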
        response = requests.get(url_download, timeout=20)
        response.raise_for_status()

        # will overwrite existing file
        with open(local_filename, mode="wb") as file:
            file.write(response.content)

        if local_filename.exists():
            print(f"Download => {local_filename.relative_to(root_path)}")

        # unzip data
        if local_filename.suffix == ".zip":
            try:
                with zipfile.ZipFile(str(local_filename), "r") as zipped_file:
                    zipped_file.extractall(str(local_filename.parent))
                    print(f"Extracted {len(zipped_file.namelist())} files.")
                # os.remove(local_filename)
            # TODO Better error logging/visibility
            except zipfile.BadZipFile:
                print(
                    f"Error while trying to extract "
                    f"{local_filename.relative_to(root_path)}"
                )
            except NotImplementedError:
                print("Zip format not supported, please unzip on the command line.")
        else:
            print(
                f"Not attempting to extract "
                f"{local_filename.relative_to(root_path)}."
            )
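
# Note: requests.get() above buffers each archive fully in memory before it
# is written to disk. If that ever becomes a problem for the larger FAOSTAT
# bulk files, a streamed download is a possible alternative. A minimal
# sketch (not part of this script):
#
#     with requests.get(url_download, stream=True, timeout=20) as response:
#         response.raise_for_status()
#         with open(local_filename, mode="wb") as file:
#             for chunk in response.iter_content(chunk_size=1024 * 1024):
#                 file.write(chunk)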