|
@@ -1,142 +1,65 @@
|
|
|
"""Downloads data from FAOSTAT website."""
|
|
|
|
|
|
import time
|
|
|
-import zipfile
|
|
|
from datetime import datetime
|
|
|
|
|
|
-import datalad.api
|
|
|
from bs4 import BeautifulSoup
|
|
|
-from helper.definitions import downloaded_data_path, root_path
|
|
|
+
|
|
|
+# from helper.definitions import downloaded_data_path, root_path
|
|
|
from selenium import webdriver
|
|
|
from selenium.webdriver.chrome.service import Service
|
|
|
|
|
|
+from src.faostat_data_primap.exceptions import DateTagNotFoundError
|
|
|
+
|
|
|
+
|
|
|
+def get_html_content(url):
|
|
|
+ """
|
|
|
+ Get the parsed HTML content of the given URL.
|
|
|
+
|
|
|
+ Parameters
|
|
|
+ ----------
|
|
|
+ url
|
|
|
+
|
|
|
+ Returns
|
|
|
+ -------
|
|
|
+ html content
|
|
|
+
|
|
|
|
|
|
-class DateTagNotFoundError(Exception):
|
|
|
"""
|
|
|
- The date when the data set was last updated could not be found
|
|
|
+ # If the driver isn't found on your system PATH, Selenium
|
|
|
+ # will automatically download it for you. Make sure there is no
|
|
|
+ # chromedriver installed on your system
|
|
|
+ service = Service()
|
|
|
+ driver = webdriver.Chrome(service=service)
|
|
|
+
|
|
|
+ driver.get(url)
|
|
|
+
|
|
|
+ # give time to load javascript
|
|
|
+ time.sleep(3)
|
|
|
+
|
|
|
+ html_content = driver.page_source
|
|
|
+
|
|
|
+ return BeautifulSoup(html_content, "html.parser")
|
|
|
+
|
|
|
+
|
|
|
+def get_last_updated_date(soup, url):
|
|
|
+ """
|
|
|
+ Get the date when the data set was last updated from the html text.
|
|
|
+
|
|
|
+ Parameters
|
|
|
+ ----------
|
|
|
+ soup
|
|
|
+ url
|
|
|
+
|
|
|
+ Returns
|
|
|
+ -------
|
|
|
+ date when data set was last updated
|
|
|
"""
|
|
|
+ date_tag = soup.find("p", {"data-role": "date"})
|
|
|
|
|
|
+ if not date_tag:
|
|
|
+ raise DateTagNotFoundError(url=url)
|
|
|
|
|
|
-def __init__(
|
|
|
- self, message="The <p> tag with data-role='date' was not found on the page."
|
|
|
-):
|
|
|
- super().__init__(message)
|
|
|
-
|
|
|
-
|
|
|
-if __name__ == "__main__":
|
|
|
- sources = [
|
|
|
- (
|
|
|
- "farm_gate_emissions_crops",
|
|
|
- "https://www.fao.org/faostat/en/#data/GCE",
|
|
|
- "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
|
|
|
- ),
|
|
|
- (
|
|
|
- "farm_gate_livestock",
|
|
|
- "https://www.fao.org/faostat/en/#data/GLE",
|
|
|
- "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
|
|
|
- ),
|
|
|
- (
|
|
|
- "farm_gate_agriculture_energy",
|
|
|
- "https://www.fao.org/faostat/en/#data/GN",
|
|
|
- "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
|
|
|
- ),
|
|
|
- (
|
|
|
- "land_use_forests",
|
|
|
- "https://www.fao.org/faostat/en/#data/GF",
|
|
|
- "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
|
|
|
- ),
|
|
|
- (
|
|
|
- "land_use_fires",
|
|
|
- "https://www.fao.org/faostat/en/#data/GI",
|
|
|
- "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
|
|
|
- ),
|
|
|
- (
|
|
|
- "land_use_drained_organic_soils",
|
|
|
- "https://www.fao.org/faostat/en/#data/GV",
|
|
|
- "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
|
|
|
- ),
|
|
|
- (
|
|
|
- "pre_post_agricultural_production",
|
|
|
- "https://www.fao.org/faostat/en/#data/GPP",
|
|
|
- "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
|
|
|
- ),
|
|
|
- ]
|
|
|
- for (
|
|
|
- ds_name,
|
|
|
- url,
|
|
|
- url_download,
|
|
|
- ) in sources:
|
|
|
- # If the driver isn't found on your system PATH, Selenium
|
|
|
- # will automatically download it for you. Make sure there is no
|
|
|
- # chromedriver installed on your system
|
|
|
- service = Service()
|
|
|
- driver = webdriver.Chrome(service=service)
|
|
|
-
|
|
|
- driver.get(url)
|
|
|
-
|
|
|
- # give time to load javascript
|
|
|
- time.sleep(3)
|
|
|
-
|
|
|
- html_content = driver.page_source
|
|
|
-
|
|
|
- soup = BeautifulSoup(html_content, "html.parser")
|
|
|
-
|
|
|
- date_tag = soup.find("p", {"data-role": "date"})
|
|
|
-
|
|
|
- if not date_tag:
|
|
|
- msg = "The <p> tag with data-role='date' was not found on the page."
|
|
|
- raise DateTagNotFoundError(msg)
|
|
|
-
|
|
|
- last_updated = date_tag.get_text()
|
|
|
-
|
|
|
- # make downloaded_data folder if it doesn't exist yet
|
|
|
- if not downloaded_data_path.exists():
|
|
|
- downloaded_data_path.mkdir()
|
|
|
-
|
|
|
- # make data set folder if it doesn't exist yet
|
|
|
- ds_path = downloaded_data_path / ds_name
|
|
|
- if not ds_path.exists():
|
|
|
- ds_path.mkdir()
|
|
|
-
|
|
|
- # create unique directory
|
|
|
- last_updated_iso = datetime.strptime(last_updated, "%B %d, %Y").strftime(
|
|
|
- "%Y-%m-%d"
|
|
|
- )
|
|
|
- local_data_dir = ds_path / last_updated_iso
|
|
|
-
|
|
|
- if not local_data_dir.exists():
|
|
|
- local_data_dir.mkdir()
|
|
|
-
|
|
|
- # download and commit with datalad
|
|
|
- local_filename = local_data_dir / f"{ds_name}.zip"
|
|
|
- datalad.api.download_url(
|
|
|
- urls=url_download,
|
|
|
- message=f"Added {ds_name}",
|
|
|
- path=str(local_filename),
|
|
|
- )
|
|
|
-
|
|
|
- if local_filename.exists():
|
|
|
- print(f"Download => {local_filename.relative_to(root_path)}")
|
|
|
- # unzip data (only for new downloads)
|
|
|
- if local_filename.suffix == ".zip":
|
|
|
- try:
|
|
|
- zipped_file = zipfile.ZipFile(str(local_filename), "r")
|
|
|
- zipped_file.extractall(str(local_filename.parent))
|
|
|
- print(f"Extracted {len(zipped_file.namelist())} files.")
|
|
|
- zipped_file.close()
|
|
|
- # os.remove(local_filename)
|
|
|
- # TODO Better error logging/visibilty
|
|
|
- except zipfile.BadZipFile:
|
|
|
- print(
|
|
|
- f"Error while trying to extract "
|
|
|
- f"{local_filename.relative_to(root_path)}"
|
|
|
- )
|
|
|
- except NotImplementedError:
|
|
|
- print(
|
|
|
- "Zip format not supported, " "please unzip on the command line."
|
|
|
- )
|
|
|
- else:
|
|
|
- print(
|
|
|
- f"Not attempting to extract "
|
|
|
- f"{local_filename.relative_to(root_path)}."
|
|
|
- )
|
|
|
+ last_updated = date_tag.get_text()
|
|
|
+ last_updated = datetime.strptime(last_updated, "%B %d, %Y").strftime("%Y-%m-%d")
|
|
|
+ return last_updated
|