Daniel Busch 5 months ago
parent
commit
d7fd2afe27
2 changed files with 49 additions and 27 deletions
  1. + 8 - 5 scripts/download_all_domains.py
  2. + 41 - 22 src/faostat_data_primap/download.py

+ 8 - 5
scripts/download_all_domains.py

@@ -9,13 +9,18 @@ from src.faostat_data_primap.download import (
 from src.faostat_data_primap.helper.definitions import downloaded_data_path, sources
 
 
-def download_all_domains(sources: list[tuple[str]]):
+def download_all_domains(sources: list[tuple[str, str, str]]) -> list[str]:
     """
-    Download input files from a remote location
+    Download and unpack all climate-related domains from the FAOSTAT website.
+
+    Extract the date when the data set was last updated and create a directory
+    with the same name. Download the zip file for each domain if
+    it does not already exist. Unpack the zip file and save the contents in
+    the same directory.
 
     Parameters
     ----------
-    download_path
+    sources
         Name of data set, url to domain overview,
         and download url
 
@@ -32,7 +37,6 @@ def download_all_domains(sources: list[tuple[str, str, str]]) -> list[str]:
     ) in sources:
         soup = get_html_content(url)
 
-        # todo Remove url input
         last_updated = get_last_updated_date(soup, url)
 
         if not downloaded_data_path.exists():
@@ -43,7 +47,6 @@ def download_all_domains(sources: list[tuple[str]]):
             ds_path.mkdir()
 
         local_data_dir = ds_path / last_updated
-
         if not local_data_dir.exists():
             local_data_dir.mkdir()
 

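As a usage sketch, each tuple in sources pairs a data set name with its domain overview url and its download url. The entries below are hypothetical placeholders; the real list is defined in src/faostat_data_primap/helper/definitions.py.

# Hypothetical example entries; the real list lives in helper/definitions.py.
sources = [
    (
        "farm_gate_emissions_crops",  # assumed data set name
        "https://www.fao.org/faostat/en/#data/GCE",  # assumed overview url
        "https://bulks-faostat.fao.org/production/Emissions_crops_E_All_Data.zip",  # assumed download url
    ),
]
downloaded_files = download_all_domains(sources)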
+ 41 - 22
src/faostat_data_primap/download.py

@@ -1,26 +1,27 @@
 """Downloads data from FAOSTAT website."""
 
+import pathlib
 import time
 import zipfile
 from datetime import datetime
 
+import bs4
 import requests
 from bs4 import BeautifulSoup
-
-# from helper.definitions import downloaded_data_path, root_path
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 
 from src.faostat_data_primap.exceptions import DateTagNotFoundError
 
 
-def get_html_content(url):
+def get_html_content(url: str) -> bs4.BeautifulSoup:
     """
     Get html from url.
 
     Parameters
     ----------
     url
+        The url to the domain overview website.
 
     Returns
     -------
@@ -28,9 +29,9 @@ def get_html_content(url):
     -------
 
     """
-    # If the driver isn't found on your system PATH, Selenium
+    # If the chrome driver isn't found on your system PATH, Selenium
     # will automatically download it for you. Make sure there is no
-    # chromedriver installed on your system
+    # chromedriver installed on your system.
     service = Service()
     driver = webdriver.Chrome(service=service)
 
@@ -44,14 +45,17 @@ def get_html_content(url):
     return BeautifulSoup(html_content, "html.parser")
 
 
-def get_last_updated_date(soup, url):
+def get_last_updated_date(soup: bs4.BeautifulSoup, url: str) -> str:
     """
     Get the date when the data set was last updated from the html text.
 
     Parameters
     ----------
     soup
+        The beautiful soup object with all html code of the domain
+        overview page.
     url
+        The url to the domain overview page.
 
     Returns
     -------
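The body of get_last_updated_date is untouched by this commit and elided from the diff. A minimal sketch of the pattern it implements, assuming the date sits in a "Last update" paragraph on the overview page (the selector, date format, and exception signature are all assumptions, not the real implementation):

# Sketch only: tag, text, and date format are assumed, not taken from the real page.
date_tag = soup.find("p", string=lambda text: text and "Last update" in text)
if date_tag is None:
    # constructor signature assumed
    raise DateTagNotFoundError(f"No 'last update' tag found on {url}")
last_updated = datetime.strptime(
    date_tag.get_text().split(":")[-1].strip(), "%B %d, %Y"
).strftime("%Y-%m-%d")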
@@ -67,52 +71,66 @@ def get_last_updated_date(soup, url):
     return last_updated
 
 
-def download_file(url_download, save_path):
+def download_file(url_download: str, save_path: pathlib.Path) -> bool:
     """
-    todo
+    Download a file from a remote URL.
+
+    If an existing file is found at this location, the download is skipped.
 
     Parameters
     ----------
     url_download
+        Remote URL to download the file from
     save_path
+        Path to save the downloaded file to
 
     Returns
     -------
         True if the file was downloaded, False if a cached file was found
     """
     if not save_path.exists():
-        response = requests.get(url_download, timeout=20)
-        response.raise_for_status()
+        with requests.get(url_download, stream=True, timeout=30) as response:
+            response.raise_for_status()
+
+            with open(save_path, mode="wb") as file:
+                # write in chunks so large files are not held in memory at once
+                for chunk in response.iter_content(chunk_size=8192):
+                    file.write(chunk)
 
-        # will overwrite existing file
-        with open(save_path, mode="wb") as file:
-            file.write(response.content)
         return True
     else:
-        print(f"Skipping {save_path}" " because it already exists.")
+        print(f"Skipping download of {save_path} because it already exists.")
     return False
 
 
-def unzip_file(local_filename):
+def unzip_file(local_filename: pathlib.Path) -> list[pathlib.Path]:
     """
-    todo
+    Unzip the file in the same directory. Skip files that are already there.
 
     Parameters
     ----------
     local_filename
+        Path to the zip file
 
     Returns
     -------
         List of unzipped files
     """
-    # unzip data (only for new downloads)
+    unzipped_files = []
     if local_filename.suffix == ".zip":
         try:
-            # TODO check if unzipped files already there
-            zipped_file = zipfile.ZipFile(str(local_filename), "r")
-            zipped_file.extractall(str(local_filename.parent))
-            print(f"Extracted {len(zipped_file.namelist())} files.")
-            zipped_file.close()
+            with zipfile.ZipFile(str(local_filename), "r") as zip_file:
+                for file_info in zip_file.infolist():
+                    extracted_file_path = local_filename.parent / file_info.filename
+
+                    if extracted_file_path.exists():
+                        print(
+                            f"File '{file_info.filename}' already exists. "
+                            f"Skipping extraction."
+                        )
+                    else:
+                        print(f"Extracting '{file_info.filename}'...")
+                        zip_file.extract(file_info, local_filename.parent)
+                        unzipped_files.append(extracted_file_path)
+
         # TODO Better error logging/visibility
         except zipfile.BadZipFile:
             print(f"Error while trying to extract {local_filename}")
@@ -120,3 +138,4 @@ def unzip_file(local_filename):
             print("Zip format not supported, please unzip on the command line.")
     else:
         print(f"Not attempting to extract {local_filename}.")
+    return unzipped_files
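Taken together, a download-then-unpack step for one domain might look like this; the path and url are illustrative only:

import pathlib

save_path = pathlib.Path("downloaded_data/example_domain/2024-01-01/data.zip")  # illustrative
url_download = "https://example.org/Emissions_example.zip"  # illustrative

save_path.parent.mkdir(parents=True, exist_ok=True)
if download_file(url_download, save_path):
    new_files = unzip_file(save_path)
    print(f"Unpacked {len(new_files)} new files.")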