download_all_domains.py

  1. """Downloads all domain data sets from FAOSTAT website."""
  2. from src.faostat_data_primap.download import (
  3. download_file,
  4. get_html_content,
  5. get_last_updated_date,
  6. unzip_file,
  7. )
  8. from src.faostat_data_primap.helper.definitions import downloaded_data_path, sources
  9. def download_all_domains(sources: list[tuple[str]]) -> list[str]:
  10. """
  11. Download and unpack all climate-related domains from the FAO stat website.
  12. Extract the date when the data set was last updated and create a directory
  13. with the same name. Download the zip files for each domain if
  14. it does not already exist. Unpack the zip file and save in
  15. the same directory.
  16. Parameters
  17. ----------
  18. sources
  19. Name of data set, url to domain overview,
  20. and download url
  21. Returns
  22. -------
  23. List of input files that have been fetched or found locally.
  24. """
  25. downloaded_files = []
  26. for (
  27. ds_name,
  28. url,
  29. url_download,
  30. ) in sources:
  31. soup = get_html_content(url)
  32. last_updated = get_last_updated_date(soup, url)
  33. if not downloaded_data_path.exists():
  34. downloaded_data_path.mkdir()
  35. ds_path = downloaded_data_path / ds_name
  36. if not ds_path.exists():
  37. ds_path.mkdir()
  38. local_data_dir = ds_path / last_updated
  39. if not local_data_dir.exists():
  40. local_data_dir.mkdir()
  41. local_filename = local_data_dir / f"{ds_name}.zip"
  42. download_file(url_download=url_download, save_path=local_filename)
  43. downloaded_files.append(str(local_filename))
  44. unzip_file(local_filename)
  45. return downloaded_files
  46. if __name__ == "__main__":
  47. download_all_domains(sources)
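
For reference, `sources` is expected to hold one `(name, overview_url, download_url)` tuple per domain; the real entries live in `src.faostat_data_primap.helper.definitions`. A minimal sketch of calling the function with a hand-built list, using placeholder names and URLs (not actual FAOSTAT entries):

# Hypothetical example entry; the actual data set names and URLs are
# defined in src.faostat_data_primap.helper.definitions.
example_sources = [
    (
        "example_domain",
        "https://www.fao.org/faostat/en/#data/XX",
        "https://example.org/bulk/Example_E_All_Data.zip",
    ),
]

downloaded = download_all_domains(example_sources)
# e.g. ['<downloaded_data_path>/example_domain/<last updated>/example_domain.zip']
print(downloaded)

Because each download directory is keyed by the domain's last-updated date, rerunning the script after an upstream update writes into a fresh directory, while earlier snapshots stay untouched on disk.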