Browse Source

download methodology document [skip ci]

Daniel Busch 5 months ago
parent
commit
2033652031

+ 10 - 7
scripts/download_all_domains.py

@@ -2,11 +2,12 @@
 
 from faostat_data_primap.download import (
     download_file,
+    download_methodology,
     get_html_content,
     get_last_updated_date,
     unzip_file,
 )
-from faostat_data_primap.helper.definitions import downloaded_data_path, sources
+from faostat_data_primap.helper.definitions import domains, downloaded_data_path
 
 
 def download_all_domains(sources: list[tuple[str]]) -> list[str]:
@@ -30,11 +31,11 @@ def download_all_domains(sources: list[tuple[str]]) -> list[str]:
 
     """
     downloaded_files = []
-    for (
-        ds_name,
-        url,
-        url_download,
-    ) in sources:
+    for ds_name, urls in domains.items():
+        url = urls["url_domain"]
+        url_download = urls["url_download"]
+        url_methodology = urls["url_methodology"]
+
         soup = get_html_content(url)
 
         last_updated = get_last_updated_date(soup, url)
@@ -50,6 +51,8 @@ def download_all_domains(sources: list[tuple[str]]) -> list[str]:
         if not local_data_dir.exists():
             local_data_dir.mkdir()
 
+        download_methodology(save_path=local_data_dir, url_download=url_methodology)
+
         local_filename = local_data_dir / f"{ds_name}.zip"
 
         download_file(url_download=url_download, save_path=local_filename)
@@ -62,4 +65,4 @@ def download_all_domains(sources: list[tuple[str]]) -> list[str]:
 
 
 if __name__ == "__main__":
-    download_all_domains(sources)
+    download_all_domains(domains)

+ 77 - 0
src/faostat_data_primap/download.py

@@ -1,5 +1,7 @@
 """Downloads data from FAOSTAT website."""
 
+import hashlib
+import os
 import pathlib
 import time
 import zipfile
@@ -14,6 +16,81 @@ from selenium.webdriver.chrome.service import Service
 from faostat_data_primap.exceptions import DateTagNotFoundError
 
 
+def find_previous_release_path(current_relase_path: pathlib.PosixPath):
+    # find the directory of the previous release
+    domain_path = current_relase_path.parent
+    releases = [
+        release
+        for release in os.listdir(domain_path)
+        if (
+            os.path.isdir(domain_path / release)
+            and (domain_path / release != current_relase_path)
+        )
+    ]
+    if not releases:
+        return None
+    previous_release = sorted(releases)[-1]
+    previous_release_path = domain_path / previous_release
+    return previous_release_path
+
+
+def calculate_checksum(file_path):
+    sha256 = hashlib.sha256()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            sha256.update(chunk)
+    return sha256.hexdigest()
+
+
+def download_methodology(url_download: str, save_path: pathlib.PosixPath):
+    filename = str(url_download).split("/")[-1]
+    download_path = save_path / filename
+    # find file to compare with
+    previous_release = find_previous_release_path(save_path)
+    # check if the file already exists in current release and there are
+    # files in previous release to compare with
+    if not download_path.exists() and previous_release:
+        file_to_compare = previous_release / filename
+        # Check if the file exists in the comparison directory
+        if file_to_compare.exists():
+            # Download the file temporarily to calculate its checksum
+            response = requests.get(url_download, stream=True)
+            response.raise_for_status()  # Check for successful request
+            file_to_download = response.content
+            file_to_download_checksum = hashlib.sha256(file_to_download).hexdigest()
+
+            # If the file exists, compare checksums
+            file_to_compare_checksum = calculate_checksum(file_to_compare)
+
+            if file_to_compare_checksum == file_to_download_checksum:
+                # Files are the same, create a symlink
+                print(
+                    f"File '{filename}' exists in the comparison directory and "
+                    f"is identical. Creating symlink."
+                )
+                os.symlink(file_to_compare, download_path)
+            else:
+                # Files are different, proceed to download
+                print(
+                    f"File '{filename}' exists in the comparison directory but differs. "
+                    f"Downloading file."
+                )
+                with open(download_path, "wb") as f:
+                    f.write(file_to_download)
+        else:
+            print(
+                f"File '{filename}' does not exist in previous release. Downloading file."
+            )
+            response = requests.get(url_download, stream=True)
+            response.raise_for_status()  # Check for successful request
+            with open(download_path, "wb") as f:
+                f.write(response.content)
+
+    # file already exists in current release
+    else:
+        print(f"Skipping download of {download_path}" " because it already exists.")
+
+
 def get_html_content(url: str) -> bs4.BeautifulSoup:
     """
     Get html from url.

+ 37 - 37
src/faostat_data_primap/helper/definitions.py

@@ -2,43 +2,43 @@
 
 from pathlib import Path
 
-sources = [
-    (
-        "farm_gate_emissions_crops",
-        "https://www.fao.org/faostat/en/#data/GCE",
-        "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
-    ),
-    (
-        "farm_gate_livestock",
-        "https://www.fao.org/faostat/en/#data/GLE",
-        "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
-    ),
-    (
-        "farm_gate_agriculture_energy",
-        "https://www.fao.org/faostat/en/#data/GN",
-        "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
-    ),
-    (
-        "land_use_forests",
-        "https://www.fao.org/faostat/en/#data/GF",
-        "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
-    ),
-    (
-        "land_use_fires",
-        "https://www.fao.org/faostat/en/#data/GI",
-        "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
-    ),
-    (
-        "land_use_drained_organic_soils",
-        "https://www.fao.org/faostat/en/#data/GV",
-        "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
-    ),
-    (
-        "pre_post_agricultural_production",
-        "https://www.fao.org/faostat/en/#data/GPP",
-        "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
-    ),
-]
+# Download configuration per FAOSTAT domain, keyed by the local data set name
+# (the name is also used for the downloaded zip file). Each entry holds:
+#   url_domain      - FAOSTAT web page of the domain
+#   url_download    - bulk download of the domain's data (zip archive)
+#   url_methodology - methodology document of the domain (PDF)
+domains = {
+    "farm_gate_emissions_crops": {
+        "url_domain": "https://www.fao.org/faostat/en/#data/GCE",
+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
+        "url_methodology": "https://files-faostat.fao.org/production/GCE/GCE_e.pdf",
+    },
+    "farm_gate_livestock": {
+        "url_domain": "https://www.fao.org/faostat/en/#data/GLE",
+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
+        "url_methodology": "https://files-faostat.fao.org/production/GLE/GLE_e.pdf",
+    },
+    "farm_gate_agriculture_energy": {
+        "url_domain": "https://www.fao.org/faostat/en/#data/GN",
+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
+        "url_methodology": "https://files-faostat.fao.org/production/GN/GN_2023Oct_Final.pdf",
+    },
+    "land_use_forests": {
+        "url_domain": "https://www.fao.org/faostat/en/#data/GF",
+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
+        "url_methodology": "https://files-faostat.fao.org/production/GF/GF_e.pdf",
+    },
+    "land_use_fires": {
+        "url_domain": "https://www.fao.org/faostat/en/#data/GI",
+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
+        "url_methodology": "https://files-faostat.fao.org/production/GI/GI_e.pdf",
+    },
+    "land_use_drained_organic_soils": {
+        "url_domain": "https://www.fao.org/faostat/en/#data/GV",
+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
+        "url_methodology": "https://files-faostat.fao.org/production/GV/GV_e.pdf",
+    },
+    "pre_post_agricultural_production": {
+        "url_domain": "https://www.fao.org/faostat/en/#data/GPP",
+        "url_download": "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
+        "url_methodology": "https://files-faostat.fao.org/production/GPP/README_Methodological_Note.pdf",
+    },
+}
 
 
 def get_root_path(root_indicator: str = ".git"):

+ 0 - 0
tests/unit/test_download.py