refactor [skip ci]

Daniel Busch 5 months ago
parent commit 03af9dafe9

+ 37 - 77
scripts/download_all_domains.py

@@ -1,50 +1,30 @@
 """Downloads all domain data sets from FAOSTAT website."""
 
-import zipfile
-
-import requests
-
-from src.faostat_data_primap.download import get_html_content, get_last_updated_date
-from src.faostat_data_primap.helper.definitions import downloaded_data_path, root_path
-
-if __name__ == "__main__":
-    sources = [
-        (
-            "farm_gate_emissions_crops",
-            "https://www.fao.org/faostat/en/#data/GCE",
-            "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
-        ),
-        (
-            "farm_gate_livestock",
-            "https://www.fao.org/faostat/en/#data/GLE",
-            "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
-        ),
-        (
-            "farm_gate_agriculture_energy",
-            "https://www.fao.org/faostat/en/#data/GN",
-            "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
-        ),
-        (
-            "land_use_forests",
-            "https://www.fao.org/faostat/en/#data/GF",
-            "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
-        ),
-        (
-            "land_use_fires",
-            "https://www.fao.org/faostat/en/#data/GI",
-            "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
-        ),
-        (
-            "land_use_drained_organic_soils",
-            "https://www.fao.org/faostat/en/#data/GV",
-            "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
-        ),
-        (
-            "pre_post_agricultural_production",
-            "https://www.fao.org/faostat/en/#data/GPP",
-            "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
-        ),
-    ]
+from src.faostat_data_primap.download import (
+    download_file,
+    get_html_content,
+    get_last_updated_date,
+    unzip_file,
+)
+from src.faostat_data_primap.helper.definitions import downloaded_data_path, sources
+
+
+def download_all_domains(sources: list[tuple[str, str, str]]) -> list[str]:
+    """
+    Download input files from a remote location.
+
+    Parameters
+    ----------
+    sources
+        List of tuples, each holding the name of a data set,
+        the url to the domain overview, and the download url.
+
+    Returns
+    -------
+        List of input files that have been fetched or found locally.
+
+    """
+    downloaded_files = []
     for (
         ds_name,
         url,
@@ -52,6 +32,7 @@ if __name__ == "__main__":
     ) in sources:
         soup = get_html_content(url)
 
+        # TODO Remove url input
         last_updated = get_last_updated_date(soup, url)
 
         if not downloaded_data_path.exists():
@@ -68,35 +49,14 @@ if __name__ == "__main__":
 
         local_filename = local_data_dir / f"{ds_name}.zip"
 
-        response = requests.get(url_download, timeout=20)
-        response.raise_for_status()
-
-        # will overwrite existing file
-        with open(local_filename, mode="wb") as file:
-            file.write(response.content)
-
-        if local_filename.exists():
-            print(f"Download => {local_filename.relative_to(root_path)}")
-            # unzip data (only for new downloads)
-            if local_filename.suffix == ".zip":
-                try:
-                    zipped_file = zipfile.ZipFile(str(local_filename), "r")
-                    zipped_file.extractall(str(local_filename.parent))
-                    print(f"Extracted {len(zipped_file.namelist())} files.")
-                    zipped_file.close()
-                    # os.remove(local_filename)
-                # TODO Better error logging/visibilty
-                except zipfile.BadZipFile:
-                    print(
-                        f"Error while trying to extract "
-                        f"{local_filename.relative_to(root_path)}"
-                    )
-                except NotImplementedError:
-                    print(
-                        "Zip format not supported, " "please unzip on the command line."
-                    )
-            else:
-                print(
-                    f"Not attempting to extract "
-                    f"{local_filename.relative_to(root_path)}."
-                )
+        download_file(url_download=url_download, save_path=local_filename)
+
+        downloaded_files.append(str(local_filename))
+
+        unzip_file(local_filename)
+
+    return downloaded_files
+
+
+if __name__ == "__main__":
+    download_all_domains(sources)

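The refactored loop stores each download in a per-domain, per-release folder before handing off to `download_file` and `unzip_file`. A minimal sketch of the resulting path layout, with stand-in values for `downloaded_data_path`, the data set name, and the scraped `last_updated` date (the folder naming is inferred from this diff and from `remove_downloads.py` below, not shown verbatim in the hidden context lines):

```python
from pathlib import Path

# Stand-in values; the real script gets these from helper.definitions
# and from get_last_updated_date scraping the domain overview page.
downloaded_data_path = Path("downloaded_data")
ds_name = "farm_gate_livestock"
last_updated = "2023-11-09"

# One folder per domain, one sub-folder per release date.
local_data_dir = downloaded_data_path / ds_name / last_updated
local_data_dir.mkdir(parents=True, exist_ok=True)
local_filename = local_data_dir / f"{ds_name}.zip"
print(local_filename)
# downloaded_data/farm_gate_livestock/2023-11-09/farm_gate_livestock.zip
```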
+ 28 - 17
scripts/remove_downloads.py

@@ -9,30 +9,41 @@ structure or maybe can be deleted altogether later.
 
 import os
 
-import click
-
+# import click
 from faostat_data_primap.helper.definitions import downloaded_data_path
 
 
-@click.command()
-@click.option(
-    "--date",
-    help="The day on which the data to be deleted was downloaded",
-    default="2023-11-09",
-)
-def run(date: str):
+# @click.command()
+# @click.option(
+#     "--level",
+#     help="Delete all files on domain or release level",
+#     default="domain",
+# )
+def run():
     """
-    Delete all downloaded files for one day.
+    Delete all downloaded files for all domains and all releases.
     """
-    domains = os.listdir(downloaded_data_path)
+    domains = [
+        d
+        for d in os.listdir(downloaded_data_path)
+        if os.path.isdir(downloaded_data_path / d)
+    ]
 
     for domain in domains:
-        path_to_files = downloaded_data_path / domain / date
-        files_to_delete = os.listdir(path_to_files)
-
-        for file in files_to_delete:
-            path_to_file = path_to_files / file
-            os.remove(path_to_file)
+        path_to_releases = downloaded_data_path / domain
+        releases = [
+            d
+            for d in os.listdir(path_to_releases)
+            if os.path.isdir(path_to_releases / d)
+        ]
+
+        for release in releases:
+            path_to_files = downloaded_data_path / domain / release
+            files_to_delete = os.listdir(path_to_files)
+
+            for file in files_to_delete:
+                path_to_file = path_to_files / file
+                os.remove(path_to_file)
 
 
 if __name__ == "__main__":

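The rewritten cleanup walks two directory levels (domain, then release) with `os.listdir` plus `os.path.isdir` filters. An equivalent sketch using only `pathlib`, which the scripts already use for path arithmetic, might look like this (the constant name is a stand-in for the real helper):

```python
from pathlib import Path

downloaded_data_path = Path("downloaded_data")  # stand-in for the helper constant

if downloaded_data_path.exists():
    # Every directory is a domain, every sub-directory a release;
    # remove each file inside a release folder.
    for domain_dir in (d for d in downloaded_data_path.iterdir() if d.is_dir()):
        for release_dir in (d for d in domain_dir.iterdir() if d.is_dir()):
            for path in release_dir.iterdir():
                if path.is_file():
                    path.unlink()
```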
+ 62 - 0
src/faostat_data_primap/download.py

@@ -1,8 +1,10 @@
 """Downloads data from FAOSTAT website."""
 
 import time
+import zipfile
 from datetime import datetime
 
+import requests
 from bs4 import BeautifulSoup
 
 # from helper.definitions import downloaded_data_path, root_path
@@ -63,3 +65,63 @@ def get_last_updated_date(soup, url):
     last_updated = date_tag.get_text()
     last_updated = datetime.strptime(last_updated, "%B %d, %Y").strftime("%Y-%m-%d")
     return last_updated
+
+
+def download_file(url_download, save_path):
+    """
+    Download a file from a url, unless it already exists at save_path.
+
+    Parameters
+    ----------
+    url_download
+        Url to download the file from.
+    save_path
+        Path the downloaded file is saved to.
+
+    Returns
+    -------
+        True if the file was downloaded, False if a cached file was found
+    """
+    if not save_path.exists():
+        response = requests.get(url_download, timeout=20)
+        response.raise_for_status()
+
+        # save_path does not exist yet, so nothing is overwritten
+        with open(save_path, mode="wb") as file:
+            file.write(response.content)
+        return True
+    else:
+        print(f"Skipping {save_path}" " because it already exists.")
+    return False
+
+
+def unzip_file(local_filename):
+    """
+    Unzip a .zip archive into the directory it is stored in.
+
+    Parameters
+    ----------
+    local_filename
+        Path to the zip file to extract.
+
+    Returns
+    -------
+        List of unzipped files (empty if the archive could not be extracted)
+    """
+    # unzip data (only for new downloads)
+    if local_filename.suffix == ".zip":
+        try:
+            # TODO check if unzipped files already there
+            with zipfile.ZipFile(local_filename, "r") as zipped_file:
+                zipped_file.extractall(local_filename.parent)
+                unzipped_files = zipped_file.namelist()
+            print(f"Extracted {len(unzipped_files)} files.")
+            return unzipped_files
+        # TODO Better error logging/visibility
+        except zipfile.BadZipFile:
+            print(f"Error while trying to extract {local_filename}")
+        except NotImplementedError:
+            print("Zip format not supported, please unzip on the command line.")
+    else:
+        print(f"Not attempting to extract {local_filename}.")
+    return []

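The zipfile handling in `unzip_file` can be exercised end-to-end without touching FAOSTAT. A self-contained round trip in a throwaway directory (file names here are made up for the demo):

```python
import tempfile
import zipfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    archive = Path(tmp) / "demo.zip"

    # Build a tiny archive to stand in for a FAOSTAT bulk download.
    with zipfile.ZipFile(archive, "w") as zf:
        zf.writestr("demo.csv", "area,year,value\n")

    # Mirror unzip_file: extract into the archive's parent directory.
    with zipfile.ZipFile(archive, "r") as zf:
        zf.extractall(archive.parent)
        print(f"Extracted {len(zf.namelist())} files.")

    assert (Path(tmp) / "demo.csv").exists()
```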
+ 38 - 0
src/faostat_data_primap/helper/definitions.py

@@ -2,6 +2,44 @@
 
 from pathlib import Path
 
+sources = [
+    (
+        "farm_gate_emissions_crops",
+        "https://www.fao.org/faostat/en/#data/GCE",
+        "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
+    ),
+    (
+        "farm_gate_livestock",
+        "https://www.fao.org/faostat/en/#data/GLE",
+        "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
+    ),
+    (
+        "farm_gate_agriculture_energy",
+        "https://www.fao.org/faostat/en/#data/GN",
+        "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
+    ),
+    (
+        "land_use_forests",
+        "https://www.fao.org/faostat/en/#data/GF",
+        "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
+    ),
+    (
+        "land_use_fires",
+        "https://www.fao.org/faostat/en/#data/GI",
+        "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
+    ),
+    (
+        "land_use_drained_organic_soils",
+        "https://www.fao.org/faostat/en/#data/GV",
+        "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
+    ),
+    (
+        "pre_post_agricultural_production",
+        "https://www.fao.org/faostat/en/#data/GPP",
+        "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
+    ),
+]
+
 
 def get_root_path(root_indicator: str = ".git"):
     """