@@ -11,9 +11,11 @@ import bs4
 import requests
 from bs4 import BeautifulSoup
 from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 
 from faostat_data_primap.exceptions import DateTagNotFoundError
+from faostat_data_primap.helper.definitions import domains, downloaded_data_path
 
 
 def find_previous_release_path(
@@ -172,7 +174,9 @@ def get_html_content(url: str) -> bs4.BeautifulSoup:
 
 
     service = Service()
-    driver = webdriver.Chrome(service=service)
+    options = Options()
+    options.add_argument("--headless")
+    driver = webdriver.Chrome(service=service, options=options)
 
     driver.get(url)
 
@@ -288,3 +292,60 @@ def unzip_file(local_filename: pathlib.PosixPath):
     else:
         print(f"Not attempting to extract " f"{local_filename}.")
     return unzipped_files
+
+
+def download_all_domains(
+    domains: dict[str, dict[str, str]] = domains,
+    downloaded_data_path: pathlib.Path = downloaded_data_path,
+) -> list[str]:
+ """
|
|
|
+ Download and unpack all climate-related domains from the FAO stat website.
|
|
|
+
|
|
|
+ Extract the date when the data set was last updated and create a directory
|
|
|
+ with the same name. Download the zip files for each domain if
|
|
|
+ it does not already exist. Unpack the zip file and save in
|
|
|
+ the same directory.
|
|
|
+
|
|
|
+ Parameters
|
|
|
+ ----------
|
|
|
+ sources
|
|
|
+ Name of data set, url to domain overview,
|
|
|
+ and download url
|
|
|
+
|
|
|
+ Returns
|
|
|
+ -------
|
|
|
+ List of input files that have been fetched or found locally.
|
|
|
+
|
|
|
+ """
|
|
|
+    downloaded_files = []
+    for ds_name, urls in domains.items():
+        url = urls["url_domain"]
+        url_download = urls["url_download"]
+        url_methodology = urls["url_methodology"]
+
+        soup = get_html_content(url)
+
+        last_updated = get_last_updated_date(soup, url)
+
+        if not downloaded_data_path.exists():
+            downloaded_data_path.mkdir()
+
+        ds_path = downloaded_data_path / ds_name
+        if not ds_path.exists():
+            ds_path.mkdir()
+
+        local_data_dir = ds_path / last_updated
+        if not local_data_dir.exists():
+            local_data_dir.mkdir()
+
+        download_methodology(save_path=local_data_dir, url_download=url_methodology)
+
+        local_filename = local_data_dir / f"{ds_name}.zip"
+
+        download_file(url_download=url_download, save_path=local_filename)
+
+        downloaded_files.append(str(local_filename))
+
+        unzip_file(local_filename)
+
+    return downloaded_files
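
For orientation, a minimal usage sketch of the new download_all_domains entry point, assuming it is exposed from the package's download module (the exact module path is not shown in this diff). The example_domains mapping and its URLs are placeholders that only mirror the keys the function reads (url_domain, url_download, url_methodology); the real mapping is shipped in faostat_data_primap.helper.definitions. A local Chrome installation is needed for the headless Selenium step in get_html_content.

import pathlib

# Assumed import path; adjust to wherever download_all_domains actually
# lives inside faostat_data_primap.
from faostat_data_primap.download import download_all_domains

# Hypothetical domain configuration with placeholder URLs. The keys of the
# inner dict match what download_all_domains reads from each entry.
example_domains = {
    "farm_gate_emissions_crops": {
        "url_domain": "https://example.org/faostat/domain-overview",
        "url_download": "https://example.org/faostat/data.zip",
        "url_methodology": "https://example.org/faostat/methodology.pdf",
    },
}

# Creates downloaded_data/<data set>/<last updated>/, downloads the
# methodology and the zip file, unpacks it, and returns the zip paths.
files = download_all_domains(
    domains=example_domains,
    downloaded_data_path=pathlib.Path("downloaded_data"),
)
print(files)

Called with no arguments, the function falls back to the domains and downloaded_data_path defaults imported from faostat_data_primap.helper.definitions.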