5 months ago · 2033652031
--- a/scripts/download_all_domains.py
+++ b/scripts/download_all_domains.py
@@ -2,11 +2,12 @@
 
				 
			
 
				 from faostat_data_primap.download import (
			
 
				     download_file,
			
 
				+    download_methodology,
			
 
				     get_html_content,
			
 
				     get_last_updated_date,
			
 
				     unzip_file,
			
 
				 )
			
 
				-from faostat_data_primap.helper.definitions import downloaded_data_path, sources
			
 
				+from faostat_data_primap.helper.definitions import domains, downloaded_data_path
			
 
				 
			
 
				 
			
 
				 def download_all_domains(sources: list[tuple[str]]) -> list[str]:
			
@@ -30,11 +31,11 @@ def download_all_domains(sources: list[tuple[str]]) -> list[str]:
 
				 
			
 
				     """
			
 
				     downloaded_files = []
			
 
				-    for (
			
 
				-        ds_name,
			
 
				-        url,
			
 
				-        url_download,
			
 
				-    ) in sources:
			
 
				+    for ds_name, urls in domains.items():
			
 
				+        url = urls["url_domain"]
			
 
				+        url_download = urls["url_download"]
			
 
				+        url_methodology = urls["url_methodology"]
			
 
				+
			
 
				         soup = get_html_content(url)
			
 
				 
			
 
				         last_updated = get_last_updated_date(soup, url)
			
@@ -50,6 +51,8 @@ def download_all_domains(sources: list[tuple[str]]) -> list[str]:
 
				         if not local_data_dir.exists():
			
 
				             local_data_dir.mkdir()
			
 
				 
			
 
				+        download_methodology(save_path=local_data_dir, url_download=url_methodology)
			
 
				+
			
 
				         local_filename = local_data_dir / f"{ds_name}.zip"
			
 
				 
			
 
				         download_file(url_download=url_download, save_path=local_filename)
			
@@ -62,4 +65,4 @@ def download_all_domains(sources: list[tuple[str]]) -> list[str]:
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    download_all_domains(sources)
			
 
				+    download_all_domains(domains)
			
--- a/src/faostat_data_primap/download.py
+++ b/src/faostat_data_primap/download.py
@@ -1,5 +1,7 @@
 
				 """Downloads data from FAOSTAT website."""
			
 
				 
			
 
				+import hashlib
			
 
				+import os
			
 
				 import pathlib
			
 
				 import time
			
 
				 import zipfile
			
@@ -14,6 +16,81 @@ from selenium.webdriver.chrome.service import Service
 
				 from faostat_data_primap.exceptions import DateTagNotFoundError
			
 
				 
			
 
				 
			
 
				+def find_previous_release_path(current_relase_path: pathlib.PosixPath):
			
 
				+    # find the directory of the previous release
			
 
				+    domain_path = current_relase_path.parent
			
 
				+    releases = [
			
 
				+        release
			
 
				+        for release in os.listdir(domain_path)
			
 
				+        if (
			
 
				+            os.path.isdir(domain_path / release)
			
 
				+            and (domain_path / release != current_relase_path)
			
 
				+        )
			
 
				+    ]
			
 
				+    if not releases:
			
 
				+        return None
			
 
				+    previous_release = sorted(releases)[-1]
			
 
				+    previous_release_path = domain_path / previous_release
			
 
				+    return previous_release_path
			
 
				+
			
 
				+
			
 
				+def calculate_checksum(file_path):
			
 
				+    sha256 = hashlib.sha256()
			
 
				+    with open(file_path, "rb") as f:
			
 
				+        for chunk in iter(lambda: f.read(4096), b""):
			
 
				+            sha256.update(chunk)
			
 
				+    return sha256.hexdigest()
			
 
				+
			
 
				+
			
 
				+def download_methodology(url_download: str, save_path: pathlib.PosixPath):
			
 
				+    filename = str(url_download).split("/")[-1]
			
 
				+    download_path = save_path / filename
			
 
				+    # find file to compare with
			
 
				+    previous_release = find_previous_release_path(save_path)
			
 
				+    # check if the file already exists in current release and there are
			
 
				+    # files in previous release to compare with
			
 
				+    if not download_path.exists() and previous_release:
			
 
				+        file_to_compare = previous_release / filename
			
 
				+        # Check if the file exists in the comparison directory
			
 
				+        if file_to_compare.exists():
			
 
				+            # Download the file temporarily to calculate its checksum
			
 
				+            response = requests.get(url_download, stream=True)
			
 
				+            response.raise_for_status()  # Check for successful request
			
 
				+            file_to_download = response.content
			
 
				+            file_to_download_checksum = hashlib.sha256(file_to_download).hexdigest()
			
 
				+
			
 
				+            # If the file exists, compare checksums
			
 
				+            file_to_compare_checksum = calculate_checksum(file_to_compare)
			
 
				+
			
 
				+            if file_to_compare_checksum == file_to_download_checksum:
			
 
				+                # Files are the same, create a symlink
			
 
				+                print(
			
 
				+                    f"File '{filename}' exists in the comparison directory and "
			
 
				+                    f"is identical. Creating symlink."
			
 
				+                )
			
 
				+                os.symlink(file_to_compare, download_path)
			
 
				+            else:
			
 
				+                # Files are different, proceed to download
			
 
				+                print(
			
 
				+                    f"File '{filename}' exists in the comparison directory but differs. "
			
 
				+                    f"Downloading file."
			
 
				+                )
			
 
				+                with open(download_path, "wb") as f:
			
 
				+                    f.write(file_to_download)
			
 
				+        else:
			
 
				+            print(
			
 
				+                f"File '{filename}' does not exist in previous release. Downloading file."
			
 
				+            )
			
 
				+            response = requests.get(url_download, stream=True)
			
 
				+            response.raise_for_status()  # Check for successful request
			
 
				+            with open(download_path, "wb") as f:
			
 
				+                f.write(response.content)
			
 
				+
			
 
				+    # file already exists in current release
			
 
				+    else:
			
 
				+        print(f"Skipping download of {download_path}" " because it already exists.")
			
 
				+
			
 
				+
			
 
				 def get_html_content(url: str) -> bs4.BeautifulSoup:
			
 
				     """
			
 
				     Get html from url.
			
--- a/src/faostat_data_primap/helper/definitions.py
+++ b/src/faostat_data_primap/helper/definitions.py
@@ -2,43 +2,43 @@
 
				 
			
 
				 from pathlib import Path
			
 
				 
			
 
				-sources = [
			
 
				-    (
			
 
				-        "farm_gate_emissions_crops",
			
 
				-        "https://www.fao.org/faostat/en/#data/GCE",
			
 
				-        "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
			
 
				-    ),
			
 
				-    (
			
 
				-        "farm_gate_livestock",
			
 
				-        "https://www.fao.org/faostat/en/#data/GLE",
			
 
				-        "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
			
 
				-    ),
			
 
				-    (
			
 
				-        "farm_gate_agriculture_energy",
			
 
				-        "https://www.fao.org/faostat/en/#data/GN",
			
 
				-        "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
			
 
				-    ),
			
 
				-    (
			
 
				-        "land_use_forests",
			
 
				-        "https://www.fao.org/faostat/en/#data/GF",
			
 
				-        "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
			
 
				-    ),
			
 
				-    (
			
 
				-        "land_use_fires",
			
 
				-        "https://www.fao.org/faostat/en/#data/GI",
			
 
				-        "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
			
 
				-    ),
			
 
				-    (
			
 
				-        "land_use_drained_organic_soils",
			
 
				-        "https://www.fao.org/faostat/en/#data/GV",
			
 
				-        "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
			
 
				-    ),
			
 
				-    (
			
 
				-        "pre_post_agricultural_production",
			
 
				-        "https://www.fao.org/faostat/en/#data/GPP",
			
 
				-        "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
			
 
				-    ),
			
 
				-]
			
 
				+domains = {
			
 
				+    "farm_gate_emissions_crops": {
			
 
				+        "url_domain": "https://www.fao.org/faostat/en/#data/GCE",
			
 
				+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
			
 
				+        "url_methodology": "https://files-faostat.fao.org/production/GCE/GCE_e.pdf",
			
 
				+    },
			
 
				+    "farm_gate_livestock": {
			
 
				+        "url_domain": "https://www.fao.org/faostat/en/#data/GLE",
			
 
				+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
			
 
				+        "url_methodology": "https://files-faostat.fao.org/production/GLE/GLE_e.pdf",
			
 
				+    },
			
 
				+    "farm_gate_agriculture_energy": {
			
 
				+        "url_domain": "https://www.fao.org/faostat/en/#data/GN",
			
 
				+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
			
 
				+        "url_methodology": "https://files-faostat.fao.org/production/GN/GN_2023Oct_Final.pdf",
			
 
				+    },
			
 
				+    "land_use_forests": {
			
 
				+        "url_domain": "https://www.fao.org/faostat/en/#data/GF",
			
 
				+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
			
 
				+        "url_methodology": "https://files-faostat.fao.org/production/GF/GF_e.pdf",
			
 
				+    },
			
 
				+    "land_use_fires": {
			
 
				+        "url_domain": "https://www.fao.org/faostat/en/#data/GI",
			
 
				+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
			
 
				+        "url_methodology": "https://files-faostat.fao.org/production/GI/GI_e.pdf",
			
 
				+    },
			
 
				+    "land_use_drained_organic_soils": {
			
 
				+        "url_domain": "https://www.fao.org/faostat/en/#data/GV",
			
 
				+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
			
 
				+        "url_methodology": "https://files-faostat.fao.org/production/GV/GV_e.pdf",
			
 
				+    },
			
 
				+    "pre_post_agricultural_production": {
			
 
				+        "url_domain": "https://www.fao.org/faostat/en/#data/GPP",
			
 
				+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
			
 
				+        "url_methodology": "https://files-faostat.fao.org/production/GPP/README_Methodological_Note.pdf",
			
 
				+    },
			
 
				+}
			
 
				 
			
 
				 
			
 
				 def get_root_path(root_indicator: str = ".git"):
			
--- a/tests/unit/test_download.py
+++ b/tests/unit/test_download.py