
[DATALAD] Recorded changes

Daniel Busch 4 months ago
commit 5a735ba16c

scripts/download_all_domains.py (+58, -63)

@@ -1,70 +1,65 @@
 """Downloads all domain data sets from FAOSTAT website."""
 
+
 from faostat_data_primap.download import (
-    download_file,
-    download_methodology,
-    get_html_content,
-    get_last_updated_date,
-    unzip_file,
+    download_all_domains,
 )
-from faostat_data_primap.helper.definitions import domains, downloaded_data_path
-
-
-def download_all_domains(
-    domains: list[tuple[str]], downloaded_data_path: str = downloaded_data_path
-) -> list[str]:
-    """
-    Download and unpack all climate-related domains from the FAO stat website.
-
-    Extract the date when the data set was last updated and create a directory
-    with the same name. Download the zip files for each domain if
-    it does not already exist. Unpack the zip file and save in
-    the same directory.
-
-    Parameters
-    ----------
-    sources
-        Name of data set, url to domain overview,
-        and download url
-
-    Returns
-    -------
-        List of input files that have been fetched or found locally.
-
-    """
-    downloaded_files = []
-    for ds_name, urls in domains.items():
-        url = urls["url_domain"]
-        url_download = urls["url_download"]
-        url_methodology = urls["url_methodology"]
-
-        soup = get_html_content(url)
-
-        last_updated = get_last_updated_date(soup, url)
-
-        if not downloaded_data_path.exists():
-            downloaded_data_path.mkdir()
-
-        ds_path = downloaded_data_path / ds_name
-        if not ds_path.exists():
-            ds_path.mkdir()
-
-        local_data_dir = ds_path / last_updated
-        if not local_data_dir.exists():
-            local_data_dir.mkdir()
-
-        download_methodology(save_path=local_data_dir, url_download=url_methodology)
-
-        local_filename = local_data_dir / f"{ds_name}.zip"
-
-        download_file(url_download=url_download, save_path=local_filename)
-
-        downloaded_files.append(str(local_filename))
-
-        unzip_file(local_filename)
-
-    return downloaded_files
 
+# def download_all_domains(
+#     domains: list[tuple[str]] = domains,
+#     downloaded_data_path: str = downloaded_data_path,
+# ) -> list[str]:
+#     """
+#     Download and unpack all climate-related domains from the FAO stat website.
+#
+#     Extract the date when the data set was last updated and create a directory
+#     with the same name. Download the zip files for each domain if
+#     it does not already exist. Unpack the zip file and save in
+#     the same directory.
+#
+#     Parameters
+#     ----------
+#     sources
+#         Name of data set, url to domain overview,
+#         and download url
+#
+#     Returns
+#     -------
+#         List of input files that have been fetched or found locally.
+#
+#     """
+#     downloaded_files = []
+#     for ds_name, urls in domains.items():
+#         url = urls["url_domain"]
+#         url_download = urls["url_download"]
+#         url_methodology = urls["url_methodology"]
+#
+#         soup = get_html_content(url)
+#
+#         last_updated = get_last_updated_date(soup, url)
+#
+#         if not downloaded_data_path.exists():
+#             downloaded_data_path.mkdir()
+#
+#         ds_path = downloaded_data_path / ds_name
+#         if not ds_path.exists():
+#             ds_path.mkdir()
+#
+#         local_data_dir = ds_path / last_updated
+#         if not local_data_dir.exists():
+#             local_data_dir.mkdir()
+#
+#         download_methodology(save_path=local_data_dir, url_download=url_methodology)
+#
+#         local_filename = local_data_dir / f"{ds_name}.zip"
+#
+#         download_file(url_download=url_download, save_path=local_filename)
+#
+#         downloaded_files.append(str(local_filename))
+#
+#         unzip_file(local_filename)
+#
+#     return downloaded_files
 
 if __name__ == "__main__":
-    download_all_domains(domains)
+    download_all_domains()
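With this commit the script becomes a thin wrapper: the download logic moves into the library, and the script just calls download_all_domains(), which takes its domain list and target directory from helper.definitions by default. A minimal sketch of calling it directly with a custom target directory (assuming the package is importable and that downloaded_data_path accepts a pathlib.Path, which is how the function body uses it):

    from pathlib import Path

    from faostat_data_primap.download import download_all_domains

    # hypothetical target directory; by default the path from
    # helper.definitions is used
    files = download_all_domains(downloaded_data_path=Path("downloaded_data"))
    print(f"{len(files)} zip archives fetched or found locally")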

src/faostat_data_primap/download.py (+62, -1)

@@ -11,9 +11,11 @@ import bs4
 import requests
 from bs4 import BeautifulSoup
 from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 
 from faostat_data_primap.exceptions import DateTagNotFoundError
+from faostat_data_primap.helper.definitions import domains, downloaded_data_path
 
 
 def find_previous_release_path(
@@ -172,7 +174,9 @@ def get_html_content(url: str) -> bs4.BeautifulSoup:
     # will automatically download it for you. Make sure there is no
     # chromedriver installed on your system.
     service = Service()
-    driver = webdriver.Chrome(service=service)
+    options = Options()
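+    # run Chrome headless (no visible browser window) so the scraper also
+    # works in CI and on machines without a display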
+    options.add_argument("--headless")
+    driver = webdriver.Chrome(service=service, options=options)
 
     driver.get(url)
 
@@ -288,3 +292,60 @@ def unzip_file(local_filename: pathlib.PosixPath):
     else:
         print(f"Not attempting to extract " f"{local_filename}.")
     return unzipped_files
+
+
+def download_all_domains(
+    domains: dict[str, dict[str, str]] = domains,
+    downloaded_data_path: pathlib.Path = downloaded_data_path,
+) -> list[str]:
+    """
+    Download and unpack all climate-related domains from the FAOSTAT website.
+
+    Extract the date when the data set was last updated and create a directory
+    with the same name. Download the zip file for each domain if it does
+    not already exist, unpack it, and save the contents in the same
+    directory.
+
+    Parameters
+    ----------
+    domains
+        Mapping of data set name to the domain overview URL,
+        download URL, and methodology URL
+
+    Returns
+    -------
+        List of input files that have been fetched or found locally.
+
+    """
+    downloaded_files = []
+    for ds_name, urls in domains.items():
+        url = urls["url_domain"]
+        url_download = urls["url_download"]
+        url_methodology = urls["url_methodology"]
+
+        soup = get_html_content(url)
+
+        last_updated = get_last_updated_date(soup, url)
+
+        if not downloaded_data_path.exists():
+            downloaded_data_path.mkdir()
+
+        ds_path = downloaded_data_path / ds_name
+        if not ds_path.exists():
+            ds_path.mkdir()
+
+        local_data_dir = ds_path / last_updated
+        if not local_data_dir.exists():
+            local_data_dir.mkdir()
+
+        download_methodology(save_path=local_data_dir, url_download=url_methodology)
+
+        local_filename = local_data_dir / f"{ds_name}.zip"
+
+        download_file(url_download=url_download, save_path=local_filename)
+
+        downloaded_files.append(str(local_filename))
+
+        unzip_file(local_filename)
+
+    return downloaded_files
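download_all_domains iterates over a domains mapping imported from helper.definitions, which is not part of this diff. Purely as an illustration of the shape the function expects (the URLs below are placeholders, not the real entries):

    # illustrative placeholder only -- the actual mapping lives in
    # faostat_data_primap.helper.definitions
    domains = {
        "farm_gate_livestock": {
            "url_domain": "https://example.org/faostat/domain-overview",
            "url_download": "https://example.org/faostat/bulk-download.zip",
            "url_methodology": "https://example.org/faostat/methodology.pdf",
        },
    }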

tests/integration/download_script.py (+32, -0)

@@ -0,0 +1,32 @@
+import os
+
+from src.faostat_data_primap.download import download_all_domains
+
+
+# test the whole download script run
+def test_download_all_domains(tmp_path):
+    downloaded_data_path = tmp_path / "downloaded_data"
+    download_all_domains(downloaded_data_path=downloaded_data_path)
+
+    expected_downloaded_domains = [
+        "farm_gate_emissions_crops",
+        "farm_gate_livestock",
+        "farm_gate_agriculture_energy",
+        "land_use_forests",
+        "land_use_fires",
+        "land_use_drained_organic_soils",
+        "pre_post_agricultural_production",
+    ]
+
+    domains = []
+    for domain in downloaded_data_path.iterdir():
+        if domain.is_dir():
+            domains.append(domain.name)
+            for release in domain.iterdir():
+                downloaded_data = os.listdir(release)
+                # make sure we have at least one .csv, one .pdf and one .zip file
+                assert [f for f in downloaded_data if f.endswith(".csv")]
+                assert [f for f in downloaded_data if f.endswith(".pdf")]
+                assert [f for f in downloaded_data if f.endswith(".zip")]
+
+    assert sorted(expected_downloaded_domains) == sorted(domains)
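The test exercises the full pipeline (Selenium with headless Chrome plus network access to the FAOSTAT servers), so it is comparatively slow and needs a working Chrome or Chromium installation. Assuming pytest is started from the repository root so that the src. import resolves, it can be run on its own with pytest tests/integration/download_script.py.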