3 months ago · b3c09466f9
--- a/src/faostat_data_primap/helper/category_aggregation.py
+++ b/src/faostat_data_primap/helper/category_aggregation.py
@@ -3,16 +3,12 @@ Definitions for category aggregation.
 
				 """
			
 
				 
			
 
				 # Checking consistency of category tree in FAO categorisation
			
 
				-# There are discrepancies of up to 100% due to rounding errors for small values
			
 
				-# theoretical example, 0.0001 (rounded from 0.00006) + 0.0004 (rounded from 0.00036)
			
 
				+# There are discrepancies of up to 100% due to rounding errors for small values,
			
 
				+# for example, 0.0001 (rounded from 0.00006) + 0.0004 (rounded from 0.00036)
			
 
				 # = 0.00042 which is then rounded to 0.0004, while the consistency check expects 0.0005
			
 
				-# At the moment, we are only checking categories that will later be used by primap-hist.
			
 
				-# If we want to use other categories we should expand this consistency check.
			
 
				 agg_info_fao = {
			
 
				     "category (FAO)": {
			
 
				-        # 1.A.1 wheat
			
 
				-        # rounding errors
			
 
				-        "1.A.1.a": {
			
 
				+        "1.A.1.a": {  # wheat
			
 
				             "tolerance": 1,
			
 
				             "sources": [
			
 
				                 "1.A.1.a.i",
			
@@ -28,9 +24,7 @@ agg_info_fao = {
 
				             ],
			
 
				             "sel": {"variable": ["N2O", "CH4"]},
			
 
				         },
			
 
				-        # 1.A.2 rice
			
 
				-        # rounding errors
			
 
				-        "1.A.2.a": {
			
 
				+        "1.A.2.a": {  # 1.A.2 rice
			
 
				             "tolerance": 1,
			
 
				             "sources": [
			
 
				                 "1.A.2.a.i",
			
@@ -210,7 +204,7 @@ agg_info_fao = {
 
				             "sel": {"variable": ["N2O"]},
			
 
				         },
			
 
				         "1.A": {
			
 
				-            # some rounding errors for CH4
			
 
				+            # crops
			
 
				             "tolerance": 1,
			
 
				             "sources": [
			
 
				                 "1.A.1",
			
@@ -243,7 +237,7 @@ agg_info_fao = {
 
				             ],
			
 
				             "sel": {"variable": ["N2O"]},
			
 
				         },
			
 
				-        # Category 1 is not available on FAOS, so that's not a check
			
 
				+        # Category 1 is not available from FAO, so this is not a consistency check
			
 
				         "1": {
			
 
				             "tolerance": 0.01,
			
 
				             "sources": [
			
@@ -268,7 +262,7 @@ agg_info_fao = {
 
				                 "3.J",
			
 
				                 "3.K",
			
 
				                 "3.L",
			
 
				-                # "3.M", # poultry is an aggregate of other categories I forgot to remove
			
 
				+                # "3.M",  # poultry is an aggregate of other categories, so it is left out
			
 
				                 "3.N",
			
 
				                 "3.O",
			
 
				                 "3.P",
			
@@ -278,7 +272,7 @@ agg_info_fao = {
 
				             "sel": {"variable": ["CH4", "N2O"]},
			
 
				         },
			
 
				         # Testing for one animal type to make sure the category tree makes sense
			
 
				-        # TODO: We could do the same for each animal but that's a lot of effort
			
 
				+        # TODO: We could do the same for each animal
			
 
				         "3.C.3.b": {
			
 
				             "tolerance": 1,
			
 
				             "sources": [
			
@@ -355,7 +349,7 @@ agg_info_fao = {
 
				             "sel": {"variable": ["CH4", "N2O", "CO2"]},
			
 
				         },
			
 
				         "6.B": {
			
 
				-            # rounding errors, NLD looks problematic but hard to tell which value is right
			
 
				+            # rounding errors
			
 
				             "tolerance": 1,
			
 
				             "sources": [
			
 
				                 "6.B.1",
			
@@ -378,11 +372,13 @@ agg_info_fao = {
 
				     }
			
 
				 }
			
 
				 
			
 
				+# Each gas is aggregated separately to make this easier to understand.
			
 
				+# This can be changed back to a single dict once it's all organised.
			
 
				 agg_info_ipcc2006_primap_N2O = {
			
 
				     "category (IPCC2006_PRIMAP)": {
			
 
				         "3.C.1": {  # Emissions from Biomass Burning
			
 
				             "sources": [
			
 
				-                # "3.C.1.a",  # Biomass Burning In Forest Lands, because not included in 2023 release
			
 
				+                # "3.C.1.a",  # leaving out "Biomass Burning In Forest Lands", because not included in 2023 release
			
 
				                 "3.C.1.b",  # Biomass Burning In Croplands
			
 
				                 "3.C.1.c",  # Biomass Burning in Grasslands
			
 
				             ],
			
@@ -401,9 +397,9 @@ agg_info_ipcc2006_primap_N2O = {
 
				                 "3.C.1.c",  # Biomass Burning in Grasslands - looks good (CH4)
			
 
				                 "3.C.4",  # Direct N2O Emissions from Managed Soils
			
 
				                 "M.3.C.4.SF",  # synthetic fertilisers direct
			
 
				-                # "3.C.5",  # Indirect N2O Emissions from Managed Soils, empty
			
 
				+                # "3.C.5",  # Indirect N2O Emissions from Managed Soils, currently empty
			
 
				                 "M.3.C.5.SF",  # synthetic fertilisers indirect
			
 
				-                # "3.C.6",  # Indirect N2O Emissions from Manure Management
			
 
				+                # "3.C.6",  # Indirect N2O Emissions from Manure Management, currently empty
			
 
				                 "3.C.7",  # rice cultivation
			
 
				                 "3.B.2",  # Drained grassland, was in LULUCF orginally
			
 
				                 "3.B.3",  # Drained cropland, was in LULUCF originally
			
@@ -416,43 +412,27 @@ agg_info_ipcc2006_primap_N2O = {
 
				         },
			
 
				         "3.C": {
			
 
				             "sources": [
			
 
				-                "M.3.C.1.AG",  # maybe better 3.C.1?
			
 
				+                "M.3.C.1.AG",  # TODO 3.C.1 would be correct, but doesn't match 2023
			
 
				                 "3.C.4",  # Direct N2O Emissions from Managed Soils
			
 
				                 "M.3.C.4.SF",  # synthetic fertilisers direct
			
 
				                 # "3.C.5",  # Indirect N2O Emissions from Managed Soils, empty
			
 
				                 "M.3.C.5.SF",  # synthetic fertilisers indirect
			
 
				-                # "3.C.6",  # Indirect N2O Emissions from Manure Management
			
 
				+                # "3.C.6",  # Indirect N2O Emissions from Manure Management, empty
			
 
				                 "3.C.7",  # rice cultivation
			
 
				                 "3.B.2",  # Drained grassland, was in LULUCF orginally
			
 
				                 "3.B.3",  # Drained cropland, was in LULUCF originally
			
 
				             ],
			
 
				             "sel": {"variable": ["N2O"]},
			
 
				         },
			
 
				-        "3.A.1.a": {  # enteric fermentation
			
 
				-            "sources": [
			
 
				-                "3.A.1.a.i",  # cattle (dairy)
			
 
				-                "3.A.1.a.ii",  # cattle (non-dairy)
			
 
				-            ]
			
 
				-        },
			
 
				-        "3.A.1": {  # enteric fermentation
			
 
				-            "sources": [
			
 
				-                "3.A.1.a",
			
 
				-                "3.A.1.b",
			
 
				-                "3.A.1.c",
			
 
				-                "3.A.1.d",
			
 
				-                "3.A.1.e",
			
 
				-                "3.A.1.f",
			
 
				-                "3.A.1.g",
			
 
				-                "3.A.1.h",
			
 
				-                "3.A.1.j",
			
 
				-            ]
			
 
				-        },
			
 
				-        "3.A.2.a": {  # decomposition of manure - CH4, N2O
			
 
				-            "sources": [
			
 
				-                "3.A.2.a.i",  # cattle (dairy)
			
 
				-                "3.A.2.a.ii",  # cattle (non-dairy)
			
 
				-            ]
			
 
				-        },
			
 
				+        # TODO 3.A.2.x are currently not read in
			
 
				+        # "3.A.2.a": {  # decomposition of manure - CH4, N2O
			
 
				+        #     "sources": [
			
 
				+        #         "3.A.2.a.i",  # cattle (dairy)
			
 
				+        #         "3.A.2.a.ii",  # cattle (non-dairy)
			
 
				+        #     ],
			
 
				+        #     "sel": {"variable": ["N2O"]},
			
 
				+        # },
			
 
				+        # # consistency check
			
 
				         # "3.A.2": {  # decomposition of manure - CH4, N2O
			
 
				         #     "sources": [
			
 
				         #         "3.A.2.a",
			
@@ -465,9 +445,13 @@ agg_info_ipcc2006_primap_N2O = {
 
				         #         "3.A.2.h",
			
 
				         #         "3.A.2.i",
			
 
				         #         "3.A.2.j",
			
 
				-        #     ]
			
 
				+        #     ],
			
 
				+        #     "sel": {"variable": ["N2O"]},
			
 
				         # },
			
 
				-        "3.A": {"sources": ["3.A.1", "3.A.2"]},
			
 
				+        "3.A": {
			
 
				+            "sources": ["3.A.1", "3.A.2"],
			
 
				+            "sel": {"variable": ["N2O"]},
			
 
				+        },
			
 
				         "M.AG": {
			
 
				             "sources": [
			
 
				                 "3.A",
			
@@ -549,9 +533,57 @@ agg_info_ipcc2006_primap_CO2 = {
 
				     }
			
 
				 }
			
 
				 
			
 
				-
			
 
				 agg_info_ipcc2006_primap_CH4 = {
			
 
				     "category (IPCC2006_PRIMAP)": {
			
 
				+        "3.A.1.a": {  # enteric fermentation
			
 
				+            "sources": [
			
 
				+                "3.A.1.a.i",  # cattle (dairy)
			
 
				+                "3.A.1.a.ii",  # cattle (non-dairy)
			
 
				+            ],
			
 
				+            "sel": {"variable": ["CH4"]},
			
 
				+        },
			
 
				+        "3.A.1": {  # enteric fermentation
			
 
				+            "sources": [
			
 
				+                "3.A.1.a",
			
 
				+                "3.A.1.b",
			
 
				+                "3.A.1.c",
			
 
				+                "3.A.1.d",
			
 
				+                "3.A.1.e",
			
 
				+                "3.A.1.f",
			
 
				+                "3.A.1.g",
			
 
				+                "3.A.1.h",
			
 
				+                "3.A.1.j",
			
 
				+            ],
			
 
				+            "sel": {"variable": ["CH4"]},
			
 
				+        },
			
 
				+        # TODO 3.A.2.x are currently not read in
			
 
				+        # "3.A.2.a": {  # decomposition of manure - CH4, N2O
			
 
				+        #     "sources": [
			
 
				+        #         "3.A.2.a.i",  # cattle (dairy)
			
 
				+        #         "3.A.2.a.ii",  # cattle (non-dairy)
			
 
				+        #     ],
			
 
				+        #     "sel": {"variable": ["CH4"]},
			
 
				+        # },
			
 
				+        # # consistency check
			
 
				+        # "3.A.2": {  # decomposition of manure - CH4, N2O
			
 
				+        #     "sources": [
			
 
				+        #         "3.A.2.a",
			
 
				+        #         "3.A.2.b",
			
 
				+        #         "3.A.2.c",
			
 
				+        #         "3.A.2.d",
			
 
				+        #         "3.A.2.e",
			
 
				+        #         "3.A.2.f",
			
 
				+        #         "3.A.2.g",
			
 
				+        #         "3.A.2.h",
			
 
				+        #         "3.A.2.i",
			
 
				+        #         "3.A.2.j",
			
 
				+        #     ],
			
 
				+        #     "sel": {"variable": ["CH4"]},
			
 
				+        # },
			
 
				+        "3.A": {
			
 
				+            "sources": ["3.A.1", "3.A.2"],
			
 
				+            "sel": {"variable": ["CH4"]},
			
 
				+        },
			
 
				         "3.C.1": {  # Emissions from Biomass Burning
			
 
				             "sources": [
			
 
				                 # "3.C.1.a",  # Biomass Burning In Forest Lands, because not there in 2023 release
			
--- a/src/faostat_data_primap/read.py
+++ b/src/faostat_data_primap/read.py
@@ -6,6 +6,8 @@ import pathlib
 
				 import climate_categories as cc
			
 
				 import pandas as pd
			
 
				 import primap2 as pm2  # type: ignore
			
 
				+import xarray
			
 
				+import xarray as xr
			
 
				 
			
 
				 from faostat_data_primap.helper.country_mapping import country_to_iso3_mapping
			
 
				 from faostat_data_primap.helper.definitions import (
			
@@ -18,6 +20,12 @@ from faostat_data_primap.helper.paths import (
 
				     downloaded_data_path,
			
 
				     extracted_data_path,
			
 
				 )
			
 
				+from src.faostat_data_primap.helper.category_aggregation import (
			
 
				+    agg_info_fao,
			
 
				+    agg_info_ipcc2006_primap_CH4,
			
 
				+    agg_info_ipcc2006_primap_CO2,
			
 
				+    agg_info_ipcc2006_primap_N2O,
			
 
				+)
			
 
				 
			
 
				 
			
 
				 def get_all_domains(downloaded_data_path: pathlib.Path) -> list[str]:
			
@@ -272,9 +280,101 @@ def read_data(  # noqa: PLR0915 PLR0912
 
				     print(f"Writing netcdf file to {filepath}")
			
 
				     data_pm2.pr.to_netcdf(filepath, encoding=encoding)
			
 
				 
			
 
				-    # next steps
			
 
				-    # convert to IPCC2006_PRIMAP categories
			
 
				-    # save final version
			
 
				+
			
 
				+def process(ds: xarray.Dataset):
			
 
				+    """
			
 
				+    Process dataset.
			
 
				+
			
 
				+    Perform the conversion from FAO to IPCC2006_PRIMAP categories
			
 
				+    and aggregate categories.
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    ds
			
 
				+        The data set to process.
			
 
				+
			
 
				+    Returns
			
 
				+    -------
			
 
				+        The processed dataset
			
 
				+
			
 
				+    """
			
 
				+    # make categorisation A from yaml
			
 
				+    categorisation_a = cc.FAO
			
 
				+    # make categorisation B from yaml
			
 
				+    categorisation_b = cc.IPCC2006_PRIMAP
			
 
				+
			
 
				+    # category FAOSTAT not yet part of climate categories, so we need to add it manually
			
 
				+    cats = {
			
 
				+        "FAO": categorisation_a,
			
 
				+        "IPCC2006_PRIMAP": categorisation_b,
			
 
				+    }
			
 
				+    # # release_name = "v2024-11-14"
			
 
				+    # release_name = "v2023-12-13"
			
 
				+    #
			
 
				+    # # reproduce 2023 data set
			
 
				+    reproduce23 = True
			
 
				+    #
			
 
				+    # ds_fao = (
			
 
				+    #         extracted_data_path
			
 
				+    #         # / "v2024-11-14/FAOSTAT_Agrifood_system_emissions_v2024-11-14_raw.nc"
			
 
				+    #         / f"{release_name}/FAOSTAT_Agrifood_system_emissions_{release_name}_raw.nc"
			
 
				+    # )
			
 
				+    # ds = pm2.open_dataset(ds_fao)
			
 
				+
			
 
				+    # drop UNFCCC data
			
 
				+    ds = ds.drop_sel(source="UNFCCC")
			
 
				+
			
 
				+    # consistency check in original categorisation
			
 
				+    ds_checked = ds.pr.add_aggregates_coordinates(agg_info=agg_info_fao)  # noqa: F841
			
 
				+    # ds_checked_if = ds_checked.pr.to_interchange_format()
			
 
				+
			
 
				+    # We need a conversion CSV file for each entity
			
 
				+    # That's a temporary workaround until convert function can filter for data variables (entities)
			
 
				+    conv = {}
			
 
				+    gases = ["CO2", "CH4", "N2O"]
			
 
				+
			
 
				+    if reproduce23:
			
 
				+        reproduce23_filename = "_reproduce23"
			
 
				+    else:
			
 
				+        reproduce23_filename = ""
			
 
				+
			
 
				+    for var in gases:
			
 
				+        conv[var] = cc.Conversion.from_csv(
			
 
				+            f"../../conversion_FAO_IPPCC2006_PRIMAP_{var}{reproduce23_filename}.csv",
			
 
				+            cats=cats,
			
 
				+        )
			
 
				+
			
 
				+    # convert for each entity
			
 
				+    da_dict = {}
			
 
				+    for var in gases:
			
 
				+        da_dict[var] = ds[var].pr.convert(
			
 
				+            dim="category (FAO)",
			
 
				+            conversion=conv[var],
			
 
				+        )
			
 
				+    result = xr.Dataset(da_dict)
			
 
				+    result.attrs = ds.attrs
			
 
				+    result.attrs["cat"] = "category (IPCC2006_PRIMAP)"
			
 
				+
			
 
				+    # convert to interchange format and back to get rid of empty categories
			
 
				+    # TODO there may be a better way to do this
			
 
				+    result_if = result.pr.to_interchange_format()
			
 
				+    result = pm2.pm2io.from_interchange_format(result_if)
			
 
				+
			
 
				+    # aggregation for each gas for better understanding
			
 
				+    # TODO this creates some duplicate code, maybe the calls can be combined
			
 
				+    result_proc = result.pr.add_aggregates_coordinates(
			
 
				+        agg_info=agg_info_ipcc2006_primap_N2O
			
 
				+    )
			
 
				+
			
 
				+    result_proc = result_proc.pr.add_aggregates_coordinates(
			
 
				+        agg_info=agg_info_ipcc2006_primap_CO2
			
 
				+    )
			
 
				+
			
 
				+    result_proc = result_proc.pr.add_aggregates_coordinates(
			
 
				+        agg_info=agg_info_ipcc2006_primap_CH4
			
 
				+    )
			
 
				+
			
 
				+    return result_proc
			
 
				 
			
 
				 
			
 
				 def read_latest_data(
			
--- a/tests/unit/test_conversion.py
+++ b/tests/unit/test_conversion.py
@@ -12,114 +12,29 @@ from src.faostat_data_primap.helper.paths import (
 
				     downloaded_data_path,
			
 
				     extracted_data_path,
			
 
				 )
			
 
				-from src.faostat_data_primap.read import read_data
			
 
				+from src.faostat_data_primap.read import process, read_data
			
 
				 
			
 
				 
			
 
				-def test_conversion_from_FAO_to_IPCC2006_PRIMAP_output_equal():
			
 
				-    # make categorisation A from yaml
			
 
				-    categorisation_a = cc.FAO
			
 
				-    # make categorisation B from yaml
			
 
				-    categorisation_b = cc.IPCC2006_PRIMAP
			
 
				-
			
 
				-    # category FAOSTAT not yet part of climate categories, so we need to add it manually
			
 
				-    cats = {
			
 
				-        "FAO": categorisation_a,
			
 
				-        "IPCC2006_PRIMAP": categorisation_b,
			
 
				-    }
			
 
				+def test_process_output_remains_the_same():
			
 
				+    # get processed data
			
 
				     # release_name = "v2024-11-14"
			
 
				     release_name = "v2023-12-13"
			
 
				+    filename_processed_ds = f"FAOSTAT_Agrifood_system_emissions_{release_name}"
			
 
				+    filepath = extracted_data_path / release_name / (filename_processed_ds + ".nc")
			
 
				+    ds_processed = pm2.open_dataset(filepath)
			
 
				 
			
 
				-    # reproduce 2023 data set
			
 
				-    reproduce23 = True
			
 
				-
			
 
				-    ds_fao = (
			
 
				+    # get raw data
			
 
				+    filename_raw_ds = (
			
 
				         extracted_data_path
			
 
				-        # / "v2024-11-14/FAOSTAT_Agrifood_system_emissions_v2024-11-14_raw.nc"
			
 
				         / f"{release_name}/FAOSTAT_Agrifood_system_emissions_{release_name}_raw.nc"
			
 
				     )
			
 
				-    ds = pm2.open_dataset(ds_fao)
			
 
				+    ds_raw = pm2.open_dataset(filename_raw_ds)
			
 
				 
			
 
				-    # drop UNFCCC data
			
 
				-    ds = ds.drop_sel(source="UNFCCC")
			
 
				+    # process raw data
			
 
				+    ds_processed_new = process(ds_raw)
			
 
				 
			
 
				-    # consistency check in original categorisation
			
 
				-    ds_checked = ds.pr.add_aggregates_coordinates(agg_info=agg_info_fao)  # noqa: F841
			
 
				-    # ds_checked_if = ds_checked.pr.to_interchange_format()
			
 
				-
			
 
				-    # We need a conversion CSV file for each entity
			
 
				-    # That's a temporary workaround until convert function can filter for data variables (entities)
			
 
				-    conv = {}
			
 
				-    gases = ["CO2", "CH4", "N2O"]
			
 
				-
			
 
				-    if reproduce23:
			
 
				-        reproduce23_filename = "_reproduce23"
			
 
				-    else:
			
 
				-        reproduce23_filename = ""
			
 
				-
			
 
				-    for var in gases:
			
 
				-        conv[var] = cc.Conversion.from_csv(
			
 
				-            f"../../conversion_FAO_IPPCC2006_PRIMAP_{var}{reproduce23_filename}.csv",
			
 
				-            cats=cats,
			
 
				-        )
			
 
				-
			
 
				-    # convert for each entity
			
 
				-    da_dict = {}
			
 
				-    for var in gases:
			
 
				-        da_dict[var] = ds[var].pr.convert(
			
 
				-            dim="category (FAO)",
			
 
				-            conversion=conv[var],
			
 
				-        )
			
 
				-    result = xr.Dataset(da_dict)
			
 
				-    result.attrs = ds.attrs
			
 
				-    result.attrs["cat"] = "category (IPCC2006_PRIMAP)"
			
 
				-
			
 
				-    # convert to interchange format and back to get rid of empty categories
			
 
				-    # TODO there may be a better way to do this
			
 
				-    result_if = result.pr.to_interchange_format()
			
 
				-    result = pm2.pm2io.from_interchange_format(result_if)
			
 
				-
			
 
				-    # aggregation for each gas for better understanding
			
 
				-    # TODO creates some duplicate code, we can combine maybe
			
 
				-    result_proc = result.pr.add_aggregates_coordinates(
			
 
				-        agg_info=agg_info_ipcc2006_primap_N2O
			
 
				-    )
			
 
				-
			
 
				-    result_proc = result_proc.pr.add_aggregates_coordinates(
			
 
				-        agg_info=agg_info_ipcc2006_primap_CO2
			
 
				-    )
			
 
				-
			
 
				-    result_proc = result_proc.pr.add_aggregates_coordinates(
			
 
				-        agg_info=agg_info_ipcc2006_primap_CH4
			
 
				-    )
			
 
				-
			
 
				-    # get processed data
			
 
				-    output_filename = f"FAOSTAT_Agrifood_system_emissions_{release_name}"
			
 
				-    output_folder = extracted_data_path / release_name
			
 
				-    filepath = output_folder / (output_filename + ".nc")
			
 
				-    ds_original = pm2.open_dataset(filepath)
			
 
				-
			
 
				-    # result_proc_if = result_proc.pr.to_interchange_format()
			
 
				-
			
 
				-    assert ds_original.broadcast_equals(result_proc)
			
 
				-    # result_proc_if = result_proc.pr.to_interchange_format()
			
 
				-    #
			
 
				-    #
			
 
				-    #
			
 
				-    # if not output_folder.exists() :
			
 
				-    #     output_folder.mkdir()
			
 
				-    #
			
 
				-    # filepath = output_folder / (output_filename + ".csv")
			
 
				-    # print(f"Writing processed primap2 file to {filepath}")
			
 
				-    # pm2.pm2io.write_interchange_format(
			
 
				-    #     filepath,
			
 
				-    #     result_proc_if,
			
 
				-    # )
			
 
				-    #
			
 
				-    # compression = dict(zlib=True, complevel=9)
			
 
				-    # encoding = {var : compression for var in result_proc.data_vars}
			
 
				-    # filepath = output_folder / (output_filename + ".nc")
			
 
				-    # print(f"Writing netcdf file to {filepath}")
			
 
				-    # result_proc.pr.to_netcdf(filepath, encoding=encoding)
			
 
				+    # compare
			
 
				+    assert ds_processed.broadcast_equals(ds_processed_new)
			
 
				 
			
 
				 
			
 
				 def test_read(tmp_path):