Explorar o código

integration test for read script

Daniel Busch hai 4 meses
pai
achega
340152f497
Modificáronse 5 ficheiros con 73 adicións e 13 borrados
  1. +27 −1
      poetry.lock
  2. +2 −0
      pyproject.toml
  3. +6 −0
      requirements.txt
  4. +10 −12
      src/faostat_data_primap/read.py
  5. +28 −0
      tests/integration/test_read_script.py

+ 27 - 1
poetry.lock

@@ -2158,6 +2158,21 @@ sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-d
 test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"]
 xml = ["lxml (>=4.9.2)"]
 
+[[package]]
+name = "pandas-stubs"
+version = "2.2.3.241009"
+description = "Type annotations for pandas"
+optional = false
+python-versions = ">=3.10"
+files = [
+    {file = "pandas_stubs-2.2.3.241009-py3-none-any.whl", hash = "sha256:3a6f8f142105a42550be677ba741ba532621f4e0acad2155c0e7b2450f114cfa"},
+    {file = "pandas_stubs-2.2.3.241009.tar.gz", hash = "sha256:d4ab618253f0acf78a5d0d2bfd6dffdd92d91a56a69bdc8144e5a5c6d25be3b5"},
+]
+
+[package.dependencies]
+numpy = ">=1.23.5"
+types-pytz = ">=2022.1.1"
+
 [[package]]
 name = "parso"
 version = "0.8.4"
@@ -3739,6 +3754,17 @@ files = [
     {file = "types_html5lib-1.1.11.20241018-py3-none-any.whl", hash = "sha256:3f1e064d9ed2c289001ae6392c84c93833abb0816165c6ff0abfc304a779f403"},
 ]
 
+[[package]]
+name = "types-pytz"
+version = "2024.2.0.20241003"
+description = "Typing stubs for pytz"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "types-pytz-2024.2.0.20241003.tar.gz", hash = "sha256:575dc38f385a922a212bac00a7d6d2e16e141132a3c955078f4a4fd13ed6cb44"},
+    {file = "types_pytz-2024.2.0.20241003-py3-none-any.whl", hash = "sha256:3e22df1336c0c6ad1d29163c8fda82736909eb977281cb823c57f8bae07118b7"},
+]
+
 [[package]]
 name = "types-requests"
 version = "2.32.0.20241016"
@@ -3933,4 +3959,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.13"
-content-hash = "2fd97730190d0fa167efdcba5bd1414d5d48e148b70fb5be5e2cf601fe3c5cb2"
+content-hash = "ba984aa6c4d56e62124fc30db68f1f9f7140a6fe95b7f50d471b30bbb9009f22"

+ 2 - 0
pyproject.toml

@@ -18,6 +18,7 @@ types-requests = "^2.32.0.20241016"
 pandas = "^2.2.3"
 pycountry = "^24.6.1"
 primap2 = "^0.11.2"
+pandas-stubs = "^2.2.3.241009"
 
 
 [tool.poetry.group.tests.dependencies]
@@ -202,6 +203,7 @@ authorized_licenses = [
     "python software foundation",
     "python software foundation license",
     "zpl 2.1",
+    'CMU License (MIT-CMU)',
 ]
 # This starting list is relatively conservative. Depending on the project, it
 # may make sense to move some of these into the authorized list

+ 6 - 0
requirements.txt

@@ -664,6 +664,9 @@ outcome==1.3.0.post0 ; python_version >= "3.10" and python_version < "3.13" \
 packaging==24.2 ; python_version >= "3.10" and python_version < "3.13" \
     --hash=sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759 \
     --hash=sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f
+pandas-stubs==2.2.3.241009 ; python_version >= "3.10" and python_version < "3.13" \
+    --hash=sha256:3a6f8f142105a42550be677ba741ba532621f4e0acad2155c0e7b2450f114cfa \
+    --hash=sha256:d4ab618253f0acf78a5d0d2bfd6dffdd92d91a56a69bdc8144e5a5c6d25be3b5
 pandas==2.2.3 ; python_version >= "3.10" and python_version < "3.13" \
     --hash=sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a \
     --hash=sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d \
@@ -932,6 +935,9 @@ types-beautifulsoup4==4.12.0.20241020 ; python_version >= "3.10" and python_vers
 types-html5lib==1.1.11.20241018 ; python_version >= "3.10" and python_version < "3.13" \
     --hash=sha256:3f1e064d9ed2c289001ae6392c84c93833abb0816165c6ff0abfc304a779f403 \
     --hash=sha256:98042555ff78d9e3a51c77c918b1041acbb7eb6c405408d8a9e150ff5beccafa
+types-pytz==2024.2.0.20241003 ; python_version >= "3.10" and python_version < "3.13" \
+    --hash=sha256:3e22df1336c0c6ad1d29163c8fda82736909eb977281cb823c57f8bae07118b7 \
+    --hash=sha256:575dc38f385a922a212bac00a7d6d2e16e141132a3c955078f4a4fd13ed6cb44
 types-requests==2.32.0.20241016 ; python_version >= "3.10" and python_version < "3.13" \
     --hash=sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95 \
     --hash=sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747

+ 10 - 12
src/faostat_data_primap/read.py

@@ -38,7 +38,7 @@ def get_all_domains(downloaded_data_path: pathlib.Path) -> list[str]:
     ]
 
 
-def get_latest_release(domain_path) -> str:
+def get_latest_release(domain_path: pathlib.Path) -> str:
     """
     Get the latest release in a domain directory.
 
@@ -60,7 +60,10 @@ def get_latest_release(domain_path) -> str:
     return sorted(all_releases, reverse=True)[0]
 
 
-def read_latest_data() -> None:
+def read_latest_data(
+    downloaded_data_path: pathlib.Path = downloaded_data_path,
+    save_path: pathlib.Path = extracted_data_path,
+) -> None:
     """
     Read and save the latest data
 
@@ -120,15 +123,6 @@ def read_latest_data() -> None:
         if df_all is None:
             df_all = df_domain
         else:
-            # makes sure there are no duplicate category names
-            if any(
-                [
-                    category in df_all["category"].unique()
-                    for category in df_domain["category"].unique()
-                ]
-            ):
-                msg = f"Duplicate category names for {domain}"
-                raise ValueError(msg)
             df_all = pd.concat(
                 [df_all, df_domain],
                 axis=0,
@@ -173,7 +167,7 @@ def read_latest_data() -> None:
     if not extracted_data_path.exists():
         extracted_data_path.mkdir()
 
-    output_folder = extracted_data_path / release_name
+    output_folder = save_path / release_name
     if not output_folder.exists():
         output_folder.mkdir()
 
@@ -184,3 +178,7 @@ def read_latest_data() -> None:
     compression = dict(zlib=True, complevel=9)
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(output_folder / (output_filename + ".nc"), encoding=encoding)
+
+    # next steps
+    # convert to IPCC2006_PRIMAP categories
+    # save final version

+ 28 - 0
tests/integration/test_read_script.py

@@ -0,0 +1,28 @@
+import os
+
+from src.faostat_data_primap.helper.paths import root_path
+from src.faostat_data_primap.read import read_latest_data
+
+
+def test_read_latest_data(tmp_path):
+    # get the downloaded data from here
+    downloaded_data_path = root_path / "downloaded_data"
+
+    # read and save latest data
+    read_latest_data(downloaded_data_path=downloaded_data_path, save_path=tmp_path)
+
+    release_folder = os.listdir(tmp_path)
+
+    # there should be one directory created
+    assert len(release_folder) == 1
+    # and it starts with "v" (the date changes with each release)
+    assert release_folder[0].startswith("v")
+
+    output_files = os.listdir(tmp_path / release_folder[0])
+    # in the folder there should be three files
+    assert len(output_files) == 3
+
+    # a .yaml, .csv, and .nc file
+    required_extensions = {"nc", "csv", "yaml"}
+    file_extensions = {file.split(".")[-1] for file in output_files}
+    assert required_extensions == file_extensions