Explorar o código

integration test for read script

Daniel Busch hai 4 meses
pai
achega
340152f497
Modificáronse 5 ficheiros con 73 adicións e 13 borrados
  1. +27 −1
      poetry.lock
  2. +2 −0
      pyproject.toml
  3. +6 −0
      requirements.txt
  4. +10 −12
      src/faostat_data_primap/read.py
  5. +28 −0
      tests/integration/test_read_script.py

+ 27 - 1
poetry.lock

@@ -2158,6 +2158,21 @@ sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-d
 test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"]
 xml = ["lxml (>=4.9.2)"]
 
+[[package]]
+name = "pandas-stubs"
+version = "2.2.3.241009"
+description = "Type annotations for pandas"
+optional = false
+python-versions = ">=3.10"
+files = [
+    {file = "pandas_stubs-2.2.3.241009-py3-none-any.whl", hash = "sha256:3a6f8f142105a42550be677ba741ba532621f4e0acad2155c0e7b2450f114cfa"},
+    {file = "pandas_stubs-2.2.3.241009.tar.gz", hash = "sha256:d4ab618253f0acf78a5d0d2bfd6dffdd92d91a56a69bdc8144e5a5c6d25be3b5"},
+]
+
+[package.dependencies]
+numpy = ">=1.23.5"
+types-pytz = ">=2022.1.1"
+
 [[package]]
 name = "parso"
 version = "0.8.4"
@@ -3739,6 +3754,17 @@ files = [
     {file = "types_html5lib-1.1.11.20241018-py3-none-any.whl", hash = "sha256:3f1e064d9ed2c289001ae6392c84c93833abb0816165c6ff0abfc304a779f403"},
 ]
 
+[[package]]
+name = "types-pytz"
+version = "2024.2.0.20241003"
+description = "Typing stubs for pytz"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "types-pytz-2024.2.0.20241003.tar.gz", hash = "sha256:575dc38f385a922a212bac00a7d6d2e16e141132a3c955078f4a4fd13ed6cb44"},
+    {file = "types_pytz-2024.2.0.20241003-py3-none-any.whl", hash = "sha256:3e22df1336c0c6ad1d29163c8fda82736909eb977281cb823c57f8bae07118b7"},
+]
+
 [[package]]
 name = "types-requests"
 version = "2.32.0.20241016"
@@ -3933,4 +3959,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.13"
-content-hash = "2fd97730190d0fa167efdcba5bd1414d5d48e148b70fb5be5e2cf601fe3c5cb2"
+content-hash = "ba984aa6c4d56e62124fc30db68f1f9f7140a6fe95b7f50d471b30bbb9009f22"

+ 2 - 0
pyproject.toml

@@ -18,6 +18,7 @@ types-requests = "^2.32.0.20241016"
 pandas = "^2.2.3"
 pycountry = "^24.6.1"
 primap2 = "^0.11.2"
+pandas-stubs = "^2.2.3.241009"
 
 
 [tool.poetry.group.tests.dependencies]
@@ -202,6 +203,7 @@ authorized_licenses = [
     "python software foundation",
     "python software foundation license",
     "zpl 2.1",
+    'CMU License (MIT-CMU)',
 ]
 # This starting list is relatively conservative. Depending on the project, it
 # may make sense to move some of these into the authorized list

+ 6 - 0
requirements.txt

@@ -664,6 +664,9 @@ outcome==1.3.0.post0 ; python_version >= "3.10" and python_version < "3.13" \
 packaging==24.2 ; python_version >= "3.10" and python_version < "3.13" \
     --hash=sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759 \
     --hash=sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f
+pandas-stubs==2.2.3.241009 ; python_version >= "3.10" and python_version < "3.13" \
+    --hash=sha256:3a6f8f142105a42550be677ba741ba532621f4e0acad2155c0e7b2450f114cfa \
+    --hash=sha256:d4ab618253f0acf78a5d0d2bfd6dffdd92d91a56a69bdc8144e5a5c6d25be3b5
 pandas==2.2.3 ; python_version >= "3.10" and python_version < "3.13" \
     --hash=sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a \
     --hash=sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d \
@@ -932,6 +935,9 @@ types-beautifulsoup4==4.12.0.20241020 ; python_version >= "3.10" and python_vers
 types-html5lib==1.1.11.20241018 ; python_version >= "3.10" and python_version < "3.13" \
     --hash=sha256:3f1e064d9ed2c289001ae6392c84c93833abb0816165c6ff0abfc304a779f403 \
     --hash=sha256:98042555ff78d9e3a51c77c918b1041acbb7eb6c405408d8a9e150ff5beccafa
+types-pytz==2024.2.0.20241003 ; python_version >= "3.10" and python_version < "3.13" \
+    --hash=sha256:3e22df1336c0c6ad1d29163c8fda82736909eb977281cb823c57f8bae07118b7 \
+    --hash=sha256:575dc38f385a922a212bac00a7d6d2e16e141132a3c955078f4a4fd13ed6cb44
 types-requests==2.32.0.20241016 ; python_version >= "3.10" and python_version < "3.13" \
     --hash=sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95 \
     --hash=sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747

+ 10 - 12
src/faostat_data_primap/read.py

@@ -38,7 +38,7 @@ def get_all_domains(downloaded_data_path: pathlib.Path) -> list[str]:
     ]
 
 
-def get_latest_release(domain_path) -> str:
+def get_latest_release(domain_path: pathlib.Path) -> str:
     """
     Get the latest release in a domain directory.
 
@@ -60,7 +60,10 @@ def get_latest_release(domain_path) -> str:
     return sorted(all_releases, reverse=True)[0]
 
 
-def read_latest_data() -> None:
+def read_latest_data(
+    downloaded_data_path: pathlib.Path = downloaded_data_path,
+    save_path: pathlib.Path = extracted_data_path,
+) -> None:
     """
     Read and save the latest data
 
@@ -120,15 +123,6 @@ def read_latest_data() -> None:
         if df_all is None:
             df_all = df_domain
         else:
-            # makes sure there are no duplicate category names
-            if any(
-                [
-                    category in df_all["category"].unique()
-                    for category in df_domain["category"].unique()
-                ]
-            ):
-                msg = f"Duplicate category names for {domain}"
-                raise ValueError(msg)
             df_all = pd.concat(
                 [df_all, df_domain],
                 axis=0,
@@ -173,7 +167,7 @@ def read_latest_data() -> None:
     if not extracted_data_path.exists():
         extracted_data_path.mkdir()
 
-    output_folder = extracted_data_path / release_name
+    output_folder = save_path / release_name
     if not output_folder.exists():
         output_folder.mkdir()
 
@@ -184,3 +178,7 @@ def read_latest_data() -> None:
     compression = dict(zlib=True, complevel=9)
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(output_folder / (output_filename + ".nc"), encoding=encoding)
+
+    # next steps
+    # convert to IPCC2006_PRIMAP categories
+    # save final version

+ 28 - 0
tests/integration/test_read_script.py

@@ -0,0 +1,28 @@
+import os
+
+from src.faostat_data_primap.helper.paths import root_path
+from src.faostat_data_primap.read import read_latest_data
+
+
+def test_read_latest_data(tmp_path):
+    # get the downloaded data from here
+    downloaded_data_path = root_path / "downloaded_data"
+
+    # read and save latest data
+    read_latest_data(downloaded_data_path=downloaded_data_path, save_path=tmp_path)
+
+    release_folder = os.listdir(tmp_path)
+
+    # there should be one directory created
+    assert len(release_folder) == 1
+    # and it starts with "v" (the date changes with each release)
+    assert release_folder[0].startswith("v")
+
+    output_files = os.listdir(tmp_path / release_folder[0])
+    # in the folder there should be three files
+    assert len(output_files) == 3
+
+    # a .yaml, .csv, and .nc file
+    required_extensions = {"nc", "csv", "yaml"}
+    file_extensions = {file.split(".")[-1] for file in output_files}
+    assert required_extensions == file_extensions