
[DATALAD] Recorded changes

Daniel Busch 4 months ago
commit 5a735ba16c

scripts/download_all_domains.py (+58, -63)

@@ -1,70 +1,65 @@
 """Downloads all domain data sets from FAOSTAT website."""
 
+
 from faostat_data_primap.download import (
-    download_file,
-    download_methodology,
-    get_html_content,
-    get_last_updated_date,
-    unzip_file,
+    download_all_domains,
 )
-from faostat_data_primap.helper.definitions import domains, downloaded_data_path
-
-
-def download_all_domains(
-    domains: list[tuple[str]], downloaded_data_path: str = downloaded_data_path
-) -> list[str]:
-    """
-    Download and unpack all climate-related domains from the FAO stat website.
-
-    Extract the date when the data set was last updated and create a directory
-    with the same name. Download the zip files for each domain if
-    it does not already exist. Unpack the zip file and save in
-    the same directory.
-
-    Parameters
-    ----------
-    sources
-        Name of data set, url to domain overview,
-        and download url
-
-    Returns
-    -------
-        List of input files that have been fetched or found locally.
-
-    """
-    downloaded_files = []
-    for ds_name, urls in domains.items():
-        url = urls["url_domain"]
-        url_download = urls["url_download"]
-        url_methodology = urls["url_methodology"]
-
-        soup = get_html_content(url)
-
-        last_updated = get_last_updated_date(soup, url)
-
-        if not downloaded_data_path.exists():
-            downloaded_data_path.mkdir()
-
-        ds_path = downloaded_data_path / ds_name
-        if not ds_path.exists():
-            ds_path.mkdir()
-
-        local_data_dir = ds_path / last_updated
-        if not local_data_dir.exists():
-            local_data_dir.mkdir()
-
-        download_methodology(save_path=local_data_dir, url_download=url_methodology)
-
-        local_filename = local_data_dir / f"{ds_name}.zip"
-
-        download_file(url_download=url_download, save_path=local_filename)
-
-        downloaded_files.append(str(local_filename))
-
-        unzip_file(local_filename)
-
-    return downloaded_files
 
+# def download_all_domains(
+#     domains: list[tuple[str]] = domains,
+#     downloaded_data_path: str = downloaded_data_path,
+# ) -> list[str]:
+#     """
+#     Download and unpack all climate-related domains from the FAO stat website.
+#
+#     Extract the date when the data set was last updated and create a directory
+#     with the same name. Download the zip files for each domain if
+#     it does not already exist. Unpack the zip file and save in
+#     the same directory.
+#
+#     Parameters
+#     ----------
+#     sources
+#         Name of data set, url to domain overview,
+#         and download url
+#
+#     Returns
+#     -------
+#         List of input files that have been fetched or found locally.
+#
+#     """
+#     downloaded_files = []
+#     for ds_name, urls in domains.items():
+#         url = urls["url_domain"]
+#         url_download = urls["url_download"]
+#         url_methodology = urls["url_methodology"]
+#
+#         soup = get_html_content(url)
+#
+#         last_updated = get_last_updated_date(soup, url)
+#
+#         if not downloaded_data_path.exists():
+#             downloaded_data_path.mkdir()
+#
+#         ds_path = downloaded_data_path / ds_name
+#         if not ds_path.exists():
+#             ds_path.mkdir()
+#
+#         local_data_dir = ds_path / last_updated
+#         if not local_data_dir.exists():
+#             local_data_dir.mkdir()
+#
+#         download_methodology(save_path=local_data_dir, url_download=url_methodology)
+#
+#         local_filename = local_data_dir / f"{ds_name}.zip"
+#
+#         download_file(url_download=url_download, save_path=local_filename)
+#
+#         downloaded_files.append(str(local_filename))
+#
+#         unzip_file(local_filename)
+#
+#     return downloaded_files
 
 if __name__ == "__main__":
-    download_all_domains(domains)
+    download_all_domains()
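With this commit the script becomes a thin wrapper: the download logic moves into the library, and the script just calls download_all_domains(), which takes its domain list and target directory from helper.definitions by default. A minimal sketch of calling it directly with a custom target directory (assuming the package is importable and that downloaded_data_path accepts a pathlib.Path, which is how the function body uses it):

    from pathlib import Path

    from faostat_data_primap.download import download_all_domains

    # hypothetical target directory; by default the path from
    # helper.definitions is used
    files = download_all_domains(downloaded_data_path=Path("downloaded_data"))
    print(f"{len(files)} zip archives fetched or found locally")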

src/faostat_data_primap/download.py (+62, -1)

@@ -11,9 +11,11 @@ import bs4
 import requests
 from bs4 import BeautifulSoup
 from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 
 from faostat_data_primap.exceptions import DateTagNotFoundError
+from faostat_data_primap.helper.definitions import domains, downloaded_data_path
 
 
 def find_previous_release_path(
@@ -172,7 +174,9 @@ def get_html_content(url: str) -> bs4.BeautifulSoup:
     # will automatically download it for you. Make sure there is no
     # chromedriver installed on your system.
     service = Service()
-    driver = webdriver.Chrome(service=service)
+    options = Options()
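+    # run Chrome headless (no visible browser window) so the scraper also
+    # works in CI and on machines without a display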
+    options.add_argument("--headless")
+    driver = webdriver.Chrome(service=service, options=options)
 
     driver.get(url)
 
@@ -288,3 +292,60 @@ def unzip_file(local_filename: pathlib.PosixPath):
     else:
         print(f"Not attempting to extract " f"{local_filename}.")
     return unzipped_files
+
+
+def download_all_domains(
+    domains: dict[str, dict[str, str]] = domains,
+    downloaded_data_path: pathlib.Path = downloaded_data_path,
+) -> list[str]:
+    """
+    Download and unpack all climate-related domains from the FAOSTAT website.
+
+    Extract the date when the data set was last updated and create a directory
+    with the same name. Download the zip file for each domain if it does
+    not already exist, unpack it, and save the contents in the same
+    directory.
+
+    Parameters
+    ----------
+    domains
+        Mapping of data set name to the domain overview URL,
+        download URL, and methodology URL
+
+    Returns
+    -------
+        List of input files that have been fetched or found locally.
+
+    """
+    downloaded_files = []
+    for ds_name, urls in domains.items():
+        url = urls["url_domain"]
+        url_download = urls["url_download"]
+        url_methodology = urls["url_methodology"]
+
+        soup = get_html_content(url)
+
+        last_updated = get_last_updated_date(soup, url)
+
+        if not downloaded_data_path.exists():
+            downloaded_data_path.mkdir()
+
+        ds_path = downloaded_data_path / ds_name
+        if not ds_path.exists():
+            ds_path.mkdir()
+
+        local_data_dir = ds_path / last_updated
+        if not local_data_dir.exists():
+            local_data_dir.mkdir()
+
+        download_methodology(save_path=local_data_dir, url_download=url_methodology)
+
+        local_filename = local_data_dir / f"{ds_name}.zip"
+
+        download_file(url_download=url_download, save_path=local_filename)
+
+        downloaded_files.append(str(local_filename))
+
+        unzip_file(local_filename)
+
+    return downloaded_files
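download_all_domains iterates over a domains mapping imported from helper.definitions, which is not part of this diff. Purely as an illustration of the shape the function expects (the URLs below are placeholders, not the real entries):

    # illustrative placeholder only -- the actual mapping lives in
    # faostat_data_primap.helper.definitions
    domains = {
        "farm_gate_livestock": {
            "url_domain": "https://example.org/faostat/domain-overview",
            "url_download": "https://example.org/faostat/bulk-download.zip",
            "url_methodology": "https://example.org/faostat/methodology.pdf",
        },
    }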

tests/integration/download_script.py (+32, -0)

@@ -0,0 +1,32 @@
+import os
+
+from src.faostat_data_primap.download import download_all_domains
+
+
+# test the whole download script run
+def test_download_all_domains(tmp_path):
+    downloaded_data_path = tmp_path / "downloaded_data"
+    download_all_domains(downloaded_data_path=downloaded_data_path)
+
+    expected_downloaded_domains = [
+        "farm_gate_emissions_crops",
+        "farm_gate_livestock",
+        "farm_gate_agriculture_energy",
+        "land_use_forests",
+        "land_use_fires",
+        "land_use_drained_organic_soils",
+        "pre_post_agricultural_production",
+    ]
+
+    domains = []
+    for domain in downloaded_data_path.iterdir():
+        if domain.is_dir():
+            domains.append(domain.name)
+            for release in domain.iterdir():
+                downloaded_data = os.listdir(release)
+                # make sure we have at least one .csv, one .pdf and one .zip file
+                assert [f for f in downloaded_data if f.endswith(".csv")]
+                assert [f for f in downloaded_data if f.endswith(".pdf")]
+                assert [f for f in downloaded_data if f.endswith(".zip")]
+
+    assert sorted(expected_downloaded_domains) == sorted(domains)
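The test exercises the full pipeline (Selenium with headless Chrome plus network access to the FAOSTAT servers), so it is comparatively slow and needs a working Chrome or Chromium installation. Assuming pytest is started from the repository root so that the src. import resolves, it can be run on its own with pytest tests/integration/download_script.py.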