download_all_domains.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. """Downloads all domain data sets from FAOSTAT website."""
from pathlib import Path

from faostat_data_primap.download import (
    download_file,
    download_methodology,
    get_html_content,
    get_last_updated_date,
    unzip_file,
)
from faostat_data_primap.helper.definitions import domains, downloaded_data_path


  10. def download_all_domains(
  11. domains: list[tuple[str]], downloaded_data_path: str = downloaded_data_path
  12. ) -> list[str]:
  13. """
  14. Download and unpack all climate-related domains from the FAO stat website.
  15. Extract the date when the data set was last updated and create a directory
  16. with the same name. Download the zip files for each domain if
  17. it does not already exist. Unpack the zip file and save in
  18. the same directory.
  19. Parameters
  20. ----------
  21. sources
  22. Name of data set, url to domain overview,
  23. and download url
  24. Returns
  25. -------
  26. List of input files that have been fetched or found locally.
  27. """
  28. downloaded_files = []
  29. for ds_name, urls in domains.items():
  30. url = urls["url_domain"]
  31. url_download = urls["url_download"]
  32. url_methodology = urls["url_methodology"]
  33. soup = get_html_content(url)
  34. last_updated = get_last_updated_date(soup, url)
  35. if not downloaded_data_path.exists():
  36. downloaded_data_path.mkdir()
  37. ds_path = downloaded_data_path / ds_name
  38. if not ds_path.exists():
  39. ds_path.mkdir()
  40. local_data_dir = ds_path / last_updated
  41. if not local_data_dir.exists():
  42. local_data_dir.mkdir()
  43. download_methodology(save_path=local_data_dir, url_download=url_methodology)
  44. local_filename = local_data_dir / f"{ds_name}.zip"
  45. download_file(url_download=url_download, save_path=local_filename)
  46. downloaded_files.append(str(local_filename))
  47. unzip_file(local_filename)
  48. return downloaded_files
  49. if __name__ == "__main__":
  50. download_all_domains(domains)