download_all_domains.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. """Downloads all domain data sets from FAOSTAT website."""
  2. from faostat_data_primap.download import (
  3. download_file,
  4. download_methodology,
  5. get_html_content,
  6. get_last_updated_date,
  7. unzip_file,
  8. )
  9. from faostat_data_primap.helper.definitions import domains, downloaded_data_path
  10. def download_all_domains(sources: list[tuple[str]]) -> list[str]:
  11. """
  12. Download and unpack all climate-related domains from the FAO stat website.
  13. Extract the date when the data set was last updated and create a directory
  14. with the same name. Download the zip files for each domain if
  15. it does not already exist. Unpack the zip file and save in
  16. the same directory.
  17. Parameters
  18. ----------
  19. sources
  20. Name of data set, url to domain overview,
  21. and download url
  22. Returns
  23. -------
  24. List of input files that have been fetched or found locally.
  25. """
  26. downloaded_files = []
  27. for ds_name, urls in domains.items():
  28. url = urls["url_domain"]
  29. url_download = urls["url_download"]
  30. url_methodology = urls["url_methodology"]
  31. soup = get_html_content(url)
  32. last_updated = get_last_updated_date(soup, url)
  33. if not downloaded_data_path.exists():
  34. downloaded_data_path.mkdir()
  35. ds_path = downloaded_data_path / ds_name
  36. if not ds_path.exists():
  37. ds_path.mkdir()
  38. local_data_dir = ds_path / last_updated
  39. if not local_data_dir.exists():
  40. local_data_dir.mkdir()
  41. download_methodology(save_path=local_data_dir, url_download=url_methodology)
  42. local_filename = local_data_dir / f"{ds_name}.zip"
  43. download_file(url_download=url_download, save_path=local_filename)
  44. downloaded_files.append(str(local_filename))
  45. unzip_file(local_filename)
  46. return downloaded_files
  47. if __name__ == "__main__":
  48. download_all_domains(domains)