refactor [skip ci]

Daniel Busch 5 months ago
parent commit 03af9dafe9

+ 37 - 77
scripts/download_all_domains.py

@@ -1,50 +1,30 @@
 """Downloads all domain data sets from FAOSTAT website."""
 
-import zipfile
-
-import requests
-
-from src.faostat_data_primap.download import get_html_content, get_last_updated_date
-from src.faostat_data_primap.helper.definitions import downloaded_data_path, root_path
-
-if __name__ == "__main__":
-    sources = [
-        (
-            "farm_gate_emissions_crops",
-            "https://www.fao.org/faostat/en/#data/GCE",
-            "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
-        ),
-        (
-            "farm_gate_livestock",
-            "https://www.fao.org/faostat/en/#data/GLE",
-            "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
-        ),
-        (
-            "farm_gate_agriculture_energy",
-            "https://www.fao.org/faostat/en/#data/GN",
-            "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
-        ),
-        (
-            "land_use_forests",
-            "https://www.fao.org/faostat/en/#data/GF",
-            "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
-        ),
-        (
-            "land_use_fires",
-            "https://www.fao.org/faostat/en/#data/GI",
-            "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
-        ),
-        (
-            "land_use_drained_organic_soils",
-            "https://www.fao.org/faostat/en/#data/GV",
-            "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
-        ),
-        (
-            "pre_post_agricultural_production",
-            "https://www.fao.org/faostat/en/#data/GPP",
-            "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
-        ),
-    ]
+from src.faostat_data_primap.download import (
+    download_file,
+    get_html_content,
+    get_last_updated_date,
+    unzip_file,
+)
+from src.faostat_data_primap.helper.definitions import downloaded_data_path, sources
+
+
+def download_all_domains(sources: list[tuple[str, str, str]]) -> list[str]:
+    """
+    Download input files from a remote location.
+
+    Parameters
+    ----------
+    sources
+        List of tuples, each holding the name of a data set,
+        the url to the domain overview, and the download url.
+
+    Returns
+    -------
+        List of input files that have been fetched or found locally.
+
+    """
+    downloaded_files = []
     for (
         ds_name,
         url,
@@ -52,6 +32,7 @@ if __name__ == "__main__":
     ) in sources:
         soup = get_html_content(url)
 
+        # TODO Remove url input
         last_updated = get_last_updated_date(soup, url)
 
         if not downloaded_data_path.exists():
@@ -68,35 +49,14 @@ if __name__ == "__main__":
 
         local_filename = local_data_dir / f"{ds_name}.zip"
 
-        response = requests.get(url_download, timeout=20)
-        response.raise_for_status()
-
-        # will overwrite existing file
-        with open(local_filename, mode="wb") as file:
-            file.write(response.content)
-
-        if local_filename.exists():
-            print(f"Download => {local_filename.relative_to(root_path)}")
-            # unzip data (only for new downloads)
-            if local_filename.suffix == ".zip":
-                try:
-                    zipped_file = zipfile.ZipFile(str(local_filename), "r")
-                    zipped_file.extractall(str(local_filename.parent))
-                    print(f"Extracted {len(zipped_file.namelist())} files.")
-                    zipped_file.close()
-                    # os.remove(local_filename)
-                # TODO Better error logging/visibilty
-                except zipfile.BadZipFile:
-                    print(
-                        f"Error while trying to extract "
-                        f"{local_filename.relative_to(root_path)}"
-                    )
-                except NotImplementedError:
-                    print(
-                        "Zip format not supported, " "please unzip on the command line."
-                    )
-            else:
-                print(
-                    f"Not attempting to extract "
-                    f"{local_filename.relative_to(root_path)}."
-                )
+        download_file(url_download=url_download, save_path=local_filename)
+
+        downloaded_files.append(str(local_filename))
+
+        unzip_file(local_filename)
+
+    return downloaded_files
+
+
+if __name__ == "__main__":
+    download_all_domains(sources)

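The refactored loop stores each download in a per-domain, per-release folder before handing off to `download_file` and `unzip_file`. A minimal sketch of the resulting path layout, with stand-in values for `downloaded_data_path`, the data set name, and the scraped `last_updated` date (the folder naming is inferred from this diff and from `remove_downloads.py` below, not shown verbatim in the hidden context lines):

```python
from pathlib import Path

# Stand-in values; the real script gets these from helper.definitions
# and from get_last_updated_date scraping the domain overview page.
downloaded_data_path = Path("downloaded_data")
ds_name = "farm_gate_livestock"
last_updated = "2023-11-09"

# One folder per domain, one sub-folder per release date.
local_data_dir = downloaded_data_path / ds_name / last_updated
local_data_dir.mkdir(parents=True, exist_ok=True)
local_filename = local_data_dir / f"{ds_name}.zip"
print(local_filename)
# downloaded_data/farm_gate_livestock/2023-11-09/farm_gate_livestock.zip
```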
+ 28 - 17
scripts/remove_downloads.py

@@ -9,30 +9,41 @@ structure or maybe can be deleted altogether later.
 
 import os
 
-import click
-
+# import click
 from faostat_data_primap.helper.definitions import downloaded_data_path
 
 
-@click.command()
-@click.option(
-    "--date",
-    help="The day on which the data to be deleted was downloaded",
-    default="2023-11-09",
-)
-def run(date: str):
+# @click.command()
+# @click.option(
+#     "--level",
+#     help="Delete all files on domain or release level",
+#     default="domain",
+# )
+def run():
     """
-    Delete all downloaded files for one day.
+    Delete all downloaded files for all domains and all releases.
     """
-    domains = os.listdir(downloaded_data_path)
+    domains = [
+        d
+        for d in os.listdir(downloaded_data_path)
+        if os.path.isdir(downloaded_data_path / d)
+    ]
 
     for domain in domains:
-        path_to_files = downloaded_data_path / domain / date
-        files_to_delete = os.listdir(path_to_files)
-
-        for file in files_to_delete:
-            path_to_file = path_to_files / file
-            os.remove(path_to_file)
+        path_to_releases = downloaded_data_path / domain
+        releases = [
+            d
+            for d in os.listdir(path_to_releases)
+            if os.path.isdir(path_to_releases / d)
+        ]
+
+        for release in releases:
+            path_to_files = downloaded_data_path / domain / release
+            files_to_delete = os.listdir(path_to_files)
+
+            for file in files_to_delete:
+                path_to_file = path_to_files / file
+                os.remove(path_to_file)
 
 
 if __name__ == "__main__":

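The rewritten cleanup walks two directory levels (domain, then release) with `os.listdir` plus `os.path.isdir` filters. An equivalent sketch using only `pathlib`, which the scripts already use for path arithmetic, might look like this (the constant name is a stand-in for the real helper):

```python
from pathlib import Path

downloaded_data_path = Path("downloaded_data")  # stand-in for the helper constant

if downloaded_data_path.exists():
    # Every directory is a domain, every sub-directory a release;
    # remove each file inside a release folder.
    for domain_dir in (d for d in downloaded_data_path.iterdir() if d.is_dir()):
        for release_dir in (d for d in domain_dir.iterdir() if d.is_dir()):
            for path in release_dir.iterdir():
                if path.is_file():
                    path.unlink()
```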
+ 62 - 0
src/faostat_data_primap/download.py

@@ -1,8 +1,10 @@
 """Downloads data from FAOSTAT website."""
 
 import time
+import zipfile
 from datetime import datetime
 
+import requests
 from bs4 import BeautifulSoup
 
 # from helper.definitions import downloaded_data_path, root_path
@@ -63,3 +65,63 @@ def get_last_updated_date(soup, url):
     last_updated = date_tag.get_text()
     last_updated = datetime.strptime(last_updated, "%B %d, %Y").strftime("%Y-%m-%d")
     return last_updated
+
+
+def download_file(url_download, save_path):
+    """
+    Download a file from a url, unless it already exists at save_path.
+
+    Parameters
+    ----------
+    url_download
+        Url to download the file from.
+    save_path
+        Path the downloaded file is saved to.
+
+    Returns
+    -------
+        True if the file was downloaded, False if a cached file was found
+    """
+    if not save_path.exists():
+        response = requests.get(url_download, timeout=20)
+        response.raise_for_status()
+
+        # save_path does not exist yet, so nothing is overwritten
+        with open(save_path, mode="wb") as file:
+            file.write(response.content)
+        return True
+    else:
+        print(f"Skipping {save_path}" " because it already exists.")
+    return False
+
+
+def unzip_file(local_filename):
+    """
+    Unzip a .zip archive into the directory it is stored in.
+
+    Parameters
+    ----------
+    local_filename
+        Path to the zip file to extract.
+
+    Returns
+    -------
+        List of unzipped files (empty if the archive could not be extracted)
+    """
+    # unzip data (only for new downloads)
+    if local_filename.suffix == ".zip":
+        try:
+            # TODO check if unzipped files already there
+            with zipfile.ZipFile(local_filename, "r") as zipped_file:
+                zipped_file.extractall(local_filename.parent)
+                unzipped_files = zipped_file.namelist()
+            print(f"Extracted {len(unzipped_files)} files.")
+            return unzipped_files
+        # TODO Better error logging/visibility
+        except zipfile.BadZipFile:
+            print(f"Error while trying to extract {local_filename}")
+        except NotImplementedError:
+            print("Zip format not supported, please unzip on the command line.")
+    else:
+        print(f"Not attempting to extract {local_filename}.")
+    return []

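The zipfile handling in `unzip_file` can be exercised end-to-end without touching FAOSTAT. A self-contained round trip in a throwaway directory (file names here are made up for the demo):

```python
import tempfile
import zipfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    archive = Path(tmp) / "demo.zip"

    # Build a tiny archive to stand in for a FAOSTAT bulk download.
    with zipfile.ZipFile(archive, "w") as zf:
        zf.writestr("demo.csv", "area,year,value\n")

    # Mirror unzip_file: extract into the archive's parent directory.
    with zipfile.ZipFile(archive, "r") as zf:
        zf.extractall(archive.parent)
        print(f"Extracted {len(zf.namelist())} files.")

    assert (Path(tmp) / "demo.csv").exists()
```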
+ 38 - 0
src/faostat_data_primap/helper/definitions.py

@@ -2,6 +2,44 @@
 
 from pathlib import Path
 
+sources = [
+    (
+        "farm_gate_emissions_crops",
+        "https://www.fao.org/faostat/en/#data/GCE",
+        "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
+    ),
+    (
+        "farm_gate_livestock",
+        "https://www.fao.org/faostat/en/#data/GLE",
+        "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
+    ),
+    (
+        "farm_gate_agriculture_energy",
+        "https://www.fao.org/faostat/en/#data/GN",
+        "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
+    ),
+    (
+        "land_use_forests",
+        "https://www.fao.org/faostat/en/#data/GF",
+        "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
+    ),
+    (
+        "land_use_fires",
+        "https://www.fao.org/faostat/en/#data/GI",
+        "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
+    ),
+    (
+        "land_use_drained_organic_soils",
+        "https://www.fao.org/faostat/en/#data/GV",
+        "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
+    ),
+    (
+        "pre_post_agricultural_production",
+        "https://www.fao.org/faostat/en/#data/GPP",
+        "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
+    ),
+]
+
 
 def get_root_path(root_indicator: str = ".git"):
     """