Daniel Busch 5 months ago
parent
commit
0a52d2ec21

+ 0 - 1
downloaded_data/farm_gate_agriculture_energy/2023-12-13/farm_gate_agriculture_energy.zip

@@ -1 +0,0 @@
-../../../.git/annex/objects/4m/p0/MD5E-s1131872--3a3329f2115c62bab08ba71183623db7.zip/MD5E-s1131872--3a3329f2115c62bab08ba71183623db7.zip

+ 0 - 1
downloaded_data/farm_gate_emissions_crops/2023-11-09/farm_gate_emissions_crops.zip

@@ -1 +0,0 @@
-../../../.git/annex/objects/mG/4x/MD5E-s5025708--d6b35891c494f61bb1699f669611a959.zip/MD5E-s5025708--d6b35891c494f61bb1699f669611a959.zip

+ 1 - 0
downloaded_data/farm_gate_emissions_crops/2023-11-09/farm_gate_emissions_crops.zip

@@ -0,0 +1 @@
+/annex/objects/MD5E-s5025708--d6b35891c494f61bb1699f669611a959.zip

+ 102 - 0
scripts/download_all_domains.py

@@ -0,0 +1,102 @@
+"""Downloads all domain data sets from FAOSTAT website."""
+
+import zipfile
+
+import requests
+
+from src.faostat_data_primap.download import get_html_content, get_last_updated_date
+from src.faostat_data_primap.helper.definitions import downloaded_data_path, root_path
+
if __name__ == "__main__":
    # (domain name,
    #  domain page used to scrape the last-updated date,
    #  direct bulk-download URL of the zipped data set)
    sources = [
        (
            "farm_gate_emissions_crops",
            "https://www.fao.org/faostat/en/#data/GCE",
            "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",
        ),
        (
            "farm_gate_livestock",
            "https://www.fao.org/faostat/en/#data/GLE",
            "https://bulks-faostat.fao.org/production/Emissions_livestock_E_All_Data.zip",
        ),
        (
            "farm_gate_agriculture_energy",
            "https://www.fao.org/faostat/en/#data/GN",
            "https://bulks-faostat.fao.org/production/Emissions_Agriculture_Energy_E_All_Data.zip",
        ),
        (
            "land_use_forests",
            "https://www.fao.org/faostat/en/#data/GF",
            "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Forests_E_All_Data.zip",
        ),
        (
            "land_use_fires",
            "https://www.fao.org/faostat/en/#data/GI",
            "https://bulks-faostat.fao.org/production/Emissions_Land_Use_Fires_E_All_Data.zip",
        ),
        (
            "land_use_drained_organic_soils",
            "https://www.fao.org/faostat/en/#data/GV",
            "https://bulks-faostat.fao.org/production/Emissions_Drained_Organic_Soils_E_All_Data.zip",
        ),
        (
            "pre_post_agricultural_production",
            "https://www.fao.org/faostat/en/#data/GPP",
            "https://bulks-faostat.fao.org/production/Emissions_Pre_Post_Production_E_All_Data.zip",
        ),
    ]
    for ds_name, url, url_download in sources:
        # Scrape the domain page for the date FAOSTAT last updated the
        # data set; the ISO date becomes the release directory name.
        soup = get_html_content(url)
        last_updated = get_last_updated_date(soup, url)

        # downloaded_data/<domain>/<YYYY-MM-DD>/ — one call with
        # parents/exist_ok replaces the three stepwise exists()/mkdir()
        # checks of the original.
        local_data_dir = downloaded_data_path / ds_name / last_updated
        local_data_dir.mkdir(parents=True, exist_ok=True)

        local_filename = local_data_dir / f"{ds_name}.zip"

        response = requests.get(url_download, timeout=20)
        response.raise_for_status()

        # will overwrite existing file
        with open(local_filename, mode="wb") as file:
            file.write(response.content)

        print(f"Download => {local_filename.relative_to(root_path)}")
        # unzip data (only for new downloads)
        if local_filename.suffix == ".zip":
            try:
                # context manager guarantees the archive handle is closed
                # even when extraction fails part-way
                with zipfile.ZipFile(str(local_filename), "r") as zipped_file:
                    zipped_file.extractall(str(local_filename.parent))
                    print(f"Extracted {len(zipped_file.namelist())} files.")
                # os.remove(local_filename)
            # TODO Better error logging/visibility
            except zipfile.BadZipFile:
                print(
                    f"Error while trying to extract "
                    f"{local_filename.relative_to(root_path)}"
                )
            except NotImplementedError:
                print("Zip format not supported, please unzip on the command line.")
        else:
            print(
                f"Not attempting to extract "
                f"{local_filename.relative_to(root_path)}."
            )

+ 51 - 128
src/faostat_data_primap/download.py

@@ -1,142 +1,65 @@
 """Downloads data from FAOSTAT website."""
 
 import time
-import zipfile
 from datetime import datetime
 
-import datalad.api
 from bs4 import BeautifulSoup
-from helper.definitions import downloaded_data_path, root_path
+
+# from helper.definitions import downloaded_data_path, root_path
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 
+from src.faostat_data_primap.exceptions import DateTagNotFoundError
+
+
def get_html_content(url):
    """
    Fetch a fully rendered page and parse it.

    The FAOSTAT domain pages build their content with javascript, so a
    plain HTTP GET would miss the data; a real browser is driven instead.

    Parameters
    ----------
    url
        Address of the FAOSTAT domain page.

    Returns
    -------
        BeautifulSoup object of the rendered page html.
    """
    # If the driver isn't found on your system PATH, Selenium
    # will automatically download it for you. Make sure there is no
    # chromedriver installed on your system
    service = Service()
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url)

        # give time to load javascript
        time.sleep(3)

        html_content = driver.page_source
    finally:
        # BUG FIX: the original never closed the driver, leaking one
        # browser process per call (seven per full run).
        driver.quit()

    return BeautifulSoup(html_content, "html.parser")
+
+
def get_last_updated_date(soup, url):
    """
    Get the date when the data set was last updated from the html text.

    The domain page shows the date in a ``<p data-role="date">`` tag,
    e.g. "December 13, 2023"; it is returned in ISO format (YYYY-MM-DD)
    so it can be used as a sortable directory name.

    Parameters
    ----------
    soup
        Parsed html content of the domain page.
    url
        Page address, used only in the error message.

    Returns
    -------
        Date (YYYY-MM-DD) when the data set was last updated.

    Raises
    ------
    DateTagNotFoundError
        If the page contains no ``<p>`` tag with ``data-role="date"``.
    """
    date_tag = soup.find("p", {"data-role": "date"})

    if not date_tag:
        raise DateTagNotFoundError(url=url)

    # "December 13, 2023" -> "2023-12-13"
    last_updated = date_tag.get_text()
    last_updated = datetime.strptime(last_updated, "%B %d, %Y").strftime("%Y-%m-%d")
    return last_updated

+ 23 - 0
src/faostat_data_primap/exceptions.py

@@ -0,0 +1,23 @@
+"""Exceptions"""
+
+
class DateTagNotFoundError(Exception):
    """
    Raised when date for latest update cannot be found on FAO domain website
    """

    # BUG FIX: in the committed file __init__ was dedented to module
    # level, so the class silently fell back to Exception.__init__ and
    # DateTagNotFoundError(url=...) (as raised in download.py) would
    # fail with TypeError. The constructor now lives inside the class.
    def __init__(
        self,
        url: str,
    ) -> None:
        """
        Initialise the error.

        Parameters
        ----------
        url
            Link to download domain page
        """
        msg = f"Tag for date last updated was not found on page with url {url}."
        super().__init__(msg)