Browse Source

[DATALAD] Recorded changes

Daniel Busch 3 months ago
parent
commit
b3c09466f9

+ 80 - 48
src/faostat_data_primap/helper/category_aggregation.py

@@ -3,16 +3,12 @@ Definitions for category aggregation.
 """
 
 # Checking consistency of category tree in FAO categorisation
-# There are discrepancies of up to 100% due to rounding errors for small values
-# theoretical example, 0.0001 (rounded from 0.00006) + 0.0004 (rounded from 0.00036)
+# There are discrepancies of up to 100% due to rounding errors for small values,
+# for example, 0.0001 (rounded from 0.00006) + 0.0004 (rounded from 0.00036)
 # = 0.00042 which is then rounded to 0.0004, while the consistency check expects 0.0005
-# At the moment, we are only checking categories that will later be used by primap-hist.
-# If we want to use other categories we should expand this consistency check.
 agg_info_fao = {
     "category (FAO)": {
-        # 1.A.1 wheat
-        # rounding errors
-        "1.A.1.a": {
+        "1.A.1.a": {  # wheat
             "tolerance": 1,
             "sources": [
                 "1.A.1.a.i",
@@ -28,9 +24,7 @@ agg_info_fao = {
             ],
             "sel": {"variable": ["N2O", "CH4"]},
         },
-        # 1.A.2 rice
-        # rounding errors
-        "1.A.2.a": {
+        "1.A.2.a": {  # 1.A.2 rice
             "tolerance": 1,
             "sources": [
                 "1.A.2.a.i",
@@ -210,7 +204,7 @@ agg_info_fao = {
             "sel": {"variable": ["N2O"]},
         },
         "1.A": {
-            # some rounding errors for CH4
+            # crops
             "tolerance": 1,
             "sources": [
                 "1.A.1",
@@ -243,7 +237,7 @@ agg_info_fao = {
             ],
             "sel": {"variable": ["N2O"]},
         },
-        # Category 1 is not available on FAOS, so that's not a check
+        # Category 1 is not available on FAO, so that's not a check
         "1": {
             "tolerance": 0.01,
             "sources": [
@@ -268,7 +262,7 @@ agg_info_fao = {
                 "3.J",
                 "3.K",
                 "3.L",
-                # "3.M", # poultry is an aggregate of other categories I forgot to remove
+                # "3.M", # poultry is an aggregate of other categories, I forgot to remove
                 "3.N",
                 "3.O",
                 "3.P",
@@ -278,7 +272,7 @@ agg_info_fao = {
             "sel": {"variable": ["CH4", "N2O"]},
         },
         # Testing for one animal type to make sure the category tree makes sense
-        # TODO: We could do the same for each animal but that's a lot of effort
+        # TODO: We could do the same for each animal
         "3.C.3.b": {
             "tolerance": 1,
             "sources": [
@@ -355,7 +349,7 @@ agg_info_fao = {
             "sel": {"variable": ["CH4", "N2O", "CO2"]},
         },
         "6.B": {
-            # rounding errors, NLD looks problematic but hard to tell which value is right
+            # rounding errors
             "tolerance": 1,
             "sources": [
                 "6.B.1",
@@ -378,11 +372,13 @@ agg_info_fao = {
     }
 }
 
+# aggregating each gas separately to make this easier to understand
+# We can change it back to one dict once it's all organised
 agg_info_ipcc2006_primap_N2O = {
     "category (IPCC2006_PRIMAP)": {
         "3.C.1": {  # Emissions from Biomass Burning
             "sources": [
-                # "3.C.1.a",  # Biomass Burning In Forest Lands, because not included in 2023 release
+                # "3.C.1.a",  # leaving out "Biomass Burning In Forest Lands", because not included in 2023 release
                 "3.C.1.b",  # Biomass Burning In Croplands
                 "3.C.1.c",  # Biomass Burning in Grasslands
             ],
@@ -401,9 +397,9 @@ agg_info_ipcc2006_primap_N2O = {
                 "3.C.1.c",  # Biomass Burning in Grasslands - looks good (CH4)
                 "3.C.4",  # Direct N2O Emissions from Managed Soils
                 "M.3.C.4.SF",  # synthetic fertilisers direct
-                # "3.C.5",  # Indirect N2O Emissions from Managed Soils, empty
+                # "3.C.5",  # Indirect N2O Emissions from Managed Soils, currently empty
                 "M.3.C.5.SF",  # synthetic fertilisers indirect
-                # "3.C.6",  # Indirect N2O Emissions from Manure Management
+                # "3.C.6",  # Indirect N2O Emissions from Manure Management, currently empty
                 "3.C.7",  # rice cultivation
                 "3.B.2",  # Drained grassland, was in LULUCF originally
                 "3.B.3",  # Drained cropland, was in LULUCF originally
@@ -416,43 +412,27 @@ agg_info_ipcc2006_primap_N2O = {
         },
         "3.C": {
             "sources": [
-                "M.3.C.1.AG",  # maybe better 3.C.1?
+                "M.3.C.1.AG",  # TODO 3.C.1 would be correct, but doesn't match 2023
                 "3.C.4",  # Direct N2O Emissions from Managed Soils
                 "M.3.C.4.SF",  # synthetic fertilisers direct
                 # "3.C.5",  # Indirect N2O Emissions from Managed Soils, empty
                 "M.3.C.5.SF",  # synthetic fertilisers indirect
-                # "3.C.6",  # Indirect N2O Emissions from Manure Management
+                # "3.C.6",  # Indirect N2O Emissions from Manure Management, empty
                 "3.C.7",  # rice cultivation
                 "3.B.2",  # Drained grassland, was in LULUCF originally
                 "3.B.3",  # Drained cropland, was in LULUCF originally
             ],
             "sel": {"variable": ["N2O"]},
         },
-        "3.A.1.a": {  # enteric fermentation
-            "sources": [
-                "3.A.1.a.i",  # cattle (dairy)
-                "3.A.1.a.ii",  # cattle (non-dairy)
-            ]
-        },
-        "3.A.1": {  # enteric fermentation
-            "sources": [
-                "3.A.1.a",
-                "3.A.1.b",
-                "3.A.1.c",
-                "3.A.1.d",
-                "3.A.1.e",
-                "3.A.1.f",
-                "3.A.1.g",
-                "3.A.1.h",
-                "3.A.1.j",
-            ]
-        },
-        "3.A.2.a": {  # decomposition of manure - CH4, N2O
-            "sources": [
-                "3.A.2.a.i",  # cattle (dairy)
-                "3.A.2.a.ii",  # cattle (non-dairy)
-            ]
-        },
+        # TODO 3.A.2.x are currently not read in
+        # "3.A.2.a": {  # decomposition of manure - CH4, N2O
+        #     "sources": [
+        #         "3.A.2.a.i",  # cattle (dairy)
+        #         "3.A.2.a.ii",  # cattle (non-dairy)
+        #     ],
+        #     "sel": {"variable": ["N2O"]},
+        # },
+        # # consistency check
         # "3.A.2": {  # decomposition of manure - CH4, N2O
         #     "sources": [
         #         "3.A.2.a",
@@ -465,9 +445,13 @@ agg_info_ipcc2006_primap_N2O = {
         #         "3.A.2.h",
         #         "3.A.2.i",
         #         "3.A.2.j",
-        #     ]
+        #     ],
+        #     "sel": {"variable": ["N2O"]},
         # },
-        "3.A": {"sources": ["3.A.1", "3.A.2"]},
+        "3.A": {
+            "sources": ["3.A.1", "3.A.2"],
+            "sel": {"variable": ["N2O"]},
+        },
         "M.AG": {
             "sources": [
                 "3.A",
@@ -549,9 +533,57 @@ agg_info_ipcc2006_primap_CO2 = {
     }
 }
 
-
 agg_info_ipcc2006_primap_CH4 = {
     "category (IPCC2006_PRIMAP)": {
+        "3.A.1.a": {  # enteric fermentation
+            "sources": [
+                "3.A.1.a.i",  # cattle (dairy)
+                "3.A.1.a.ii",  # cattle (non-dairy)
+            ],
+            "sel": {"variable": ["CH4"]},
+        },
+        "3.A.1": {  # enteric fermentation
+            "sources": [
+                "3.A.1.a",
+                "3.A.1.b",
+                "3.A.1.c",
+                "3.A.1.d",
+                "3.A.1.e",
+                "3.A.1.f",
+                "3.A.1.g",
+                "3.A.1.h",
+                "3.A.1.j",
+            ],
+            "sel": {"variable": ["CH4"]},
+        },
+        # TODO 3.A.2.x are currently not read in
+        # "3.A.2.a": {  # decomposition of manure - CH4, N2O
+        #     "sources": [
+        #         "3.A.2.a.i",  # cattle (dairy)
+        #         "3.A.2.a.ii",  # cattle (non-dairy)
+        #     ],
+        #     "sel": {"variable": ["CH4"]},
+        # },
+        # # consistency check
+        # "3.A.2": {  # decomposition of manure - CH4, N2O
+        #     "sources": [
+        #         "3.A.2.a",
+        #         "3.A.2.b",
+        #         "3.A.2.c",
+        #         "3.A.2.d",
+        #         "3.A.2.e",
+        #         "3.A.2.f",
+        #         "3.A.2.g",
+        #         "3.A.2.h",
+        #         "3.A.2.i",
+        #         "3.A.2.j",
+        #     ],
+        #     "sel": {"variable": ["CH4"]},
+        # },
+        "3.A": {
+            "sources": ["3.A.1", "3.A.2"],
+            "sel": {"variable": ["CH4"]},
+        },
         "3.C.1": {  # Emissions from Biomass Burning
             "sources": [
                 # "3.C.1.a",  # Biomass Burning In Forest Lands, because not there in 2023 release

+ 103 - 3
src/faostat_data_primap/read.py

@@ -6,6 +6,8 @@ import pathlib
 import climate_categories as cc
 import pandas as pd
 import primap2 as pm2  # type: ignore
+import xarray
+import xarray as xr
 
 from faostat_data_primap.helper.country_mapping import country_to_iso3_mapping
 from faostat_data_primap.helper.definitions import (
@@ -18,6 +20,12 @@ from faostat_data_primap.helper.paths import (
     downloaded_data_path,
     extracted_data_path,
 )
+from src.faostat_data_primap.helper.category_aggregation import (
+    agg_info_fao,
+    agg_info_ipcc2006_primap_CH4,
+    agg_info_ipcc2006_primap_CO2,
+    agg_info_ipcc2006_primap_N2O,
+)
 
 
 def get_all_domains(downloaded_data_path: pathlib.Path) -> list[str]:
@@ -272,9 +280,101 @@ def read_data(  # noqa: PLR0915 PLR0912
     print(f"Writing netcdf file to {filepath}")
     data_pm2.pr.to_netcdf(filepath, encoding=encoding)
 
-    # next steps
-    # convert to IPCC2006_PRIMAP categories
-    # save final version
+
+def process(ds: xarray.Dataset):
+    """
+    Process dataset.
+
+    Perform the conversion from FAO to IPCC2006_PRIMAP categories
+    and aggregate categories.
+
+    Parameters
+    ----------
+    ds
+        The data set to process.
+
+    Returns
+    -------
+        The processed dataset
+
+    """
+    # make categorisation A from yaml
+    categorisation_a = cc.FAO
+    # make categorisation B from yaml
+    categorisation_b = cc.IPCC2006_PRIMAP
+
+    # category FAOSTAT not yet part of climate categories, so we need to add it manually
+    cats = {
+        "FAO": categorisation_a,
+        "IPCC2006_PRIMAP": categorisation_b,
+    }
+    # # release_name = "v2024-11-14"
+    # release_name = "v2023-12-13"
+    #
+    # # reproduce 2023 data set
+    reproduce23 = True
+    #
+    # ds_fao = (
+    #         extracted_data_path
+    #         # / "v2024-11-14/FAOSTAT_Agrifood_system_emissions_v2024-11-14_raw.nc"
+    #         / f"{release_name}/FAOSTAT_Agrifood_system_emissions_{release_name}_raw.nc"
+    # )
+    # ds = pm2.open_dataset(ds_fao)
+
+    # drop UNFCCC data
+    ds = ds.drop_sel(source="UNFCCC")
+
+    # consistency check in original categorisation
+    ds_checked = ds.pr.add_aggregates_coordinates(agg_info=agg_info_fao)  # noqa: F841
+    # ds_checked_if = ds_checked.pr.to_interchange_format()
+
+    # We need a conversion CSV file for each entity
+    # That's a temporary workaround until convert function can filter for data variables (entities)
+    conv = {}
+    gases = ["CO2", "CH4", "N2O"]
+
+    if reproduce23:
+        reproduce23_filename = "_reproduce23"
+    else:
+        reproduce23_filename = ""
+
+    for var in gases:
+        conv[var] = cc.Conversion.from_csv(
+            f"../../conversion_FAO_IPPCC2006_PRIMAP_{var}{reproduce23_filename}.csv",
+            cats=cats,
+        )
+
+    # convert for each entity
+    da_dict = {}
+    for var in gases:
+        da_dict[var] = ds[var].pr.convert(
+            dim="category (FAO)",
+            conversion=conv[var],
+        )
+    result = xr.Dataset(da_dict)
+    result.attrs = ds.attrs
+    result.attrs["cat"] = "category (IPCC2006_PRIMAP)"
+
+    # convert to interchange format and back to get rid of empty categories
+    # TODO there may be a better way to do this
+    result_if = result.pr.to_interchange_format()
+    result = pm2.pm2io.from_interchange_format(result_if)
+
+    # aggregation for each gas for better understanding
+    # TODO creates some duplicate code, we can combine maybe
+    result_proc = result.pr.add_aggregates_coordinates(
+        agg_info=agg_info_ipcc2006_primap_N2O
+    )
+
+    result_proc = result_proc.pr.add_aggregates_coordinates(
+        agg_info=agg_info_ipcc2006_primap_CO2
+    )
+
+    result_proc = result_proc.pr.add_aggregates_coordinates(
+        agg_info=agg_info_ipcc2006_primap_CH4
+    )
+
+    return result_proc
 
 
 def read_latest_data(

+ 13 - 98
tests/unit/test_conversion.py

@@ -12,114 +12,29 @@ from src.faostat_data_primap.helper.paths import (
     downloaded_data_path,
     extracted_data_path,
 )
-from src.faostat_data_primap.read import read_data
+from src.faostat_data_primap.read import process, read_data
 
 
-def test_conversion_from_FAO_to_IPCC2006_PRIMAP_output_equal():
-    # make categorisation A from yaml
-    categorisation_a = cc.FAO
-    # make categorisation B from yaml
-    categorisation_b = cc.IPCC2006_PRIMAP
-
-    # category FAOSTAT not yet part of climate categories, so we need to add it manually
-    cats = {
-        "FAO": categorisation_a,
-        "IPCC2006_PRIMAP": categorisation_b,
-    }
+def test_process_output_remains_the_same():
+    # get processed data
     # release_name = "v2024-11-14"
     release_name = "v2023-12-13"
+    filename_processed_ds = f"FAOSTAT_Agrifood_system_emissions_{release_name}"
+    filepath = extracted_data_path / release_name / (filename_processed_ds + ".nc")
+    ds_processed = pm2.open_dataset(filepath)
 
-    # reproduce 2023 data set
-    reproduce23 = True
-
-    ds_fao = (
+    # get raw data
+    filename_raw_ds = (
         extracted_data_path
-        # / "v2024-11-14/FAOSTAT_Agrifood_system_emissions_v2024-11-14_raw.nc"
         / f"{release_name}/FAOSTAT_Agrifood_system_emissions_{release_name}_raw.nc"
     )
-    ds = pm2.open_dataset(ds_fao)
+    ds_raw = pm2.open_dataset(filename_raw_ds)
 
-    # drop UNFCCC data
-    ds = ds.drop_sel(source="UNFCCC")
+    # process raw data
+    ds_processed_new = process(ds_raw)
 
-    # consistency check in original categorisation
-    ds_checked = ds.pr.add_aggregates_coordinates(agg_info=agg_info_fao)  # noqa: F841
-    # ds_checked_if = ds_checked.pr.to_interchange_format()
-
-    # We need a conversion CSV file for each entity
-    # That's a temporary workaround until convert function can filter for data variables (entities)
-    conv = {}
-    gases = ["CO2", "CH4", "N2O"]
-
-    if reproduce23:
-        reproduce23_filename = "_reproduce23"
-    else:
-        reproduce23_filename = ""
-
-    for var in gases:
-        conv[var] = cc.Conversion.from_csv(
-            f"../../conversion_FAO_IPPCC2006_PRIMAP_{var}{reproduce23_filename}.csv",
-            cats=cats,
-        )
-
-    # convert for each entity
-    da_dict = {}
-    for var in gases:
-        da_dict[var] = ds[var].pr.convert(
-            dim="category (FAO)",
-            conversion=conv[var],
-        )
-    result = xr.Dataset(da_dict)
-    result.attrs = ds.attrs
-    result.attrs["cat"] = "category (IPCC2006_PRIMAP)"
-
-    # convert to interchange format and back to get rid of empty categories
-    # TODO there may be a better way to do this
-    result_if = result.pr.to_interchange_format()
-    result = pm2.pm2io.from_interchange_format(result_if)
-
-    # aggregation for each gas for better understanding
-    # TODO creates some duplicate code, we can combine maybe
-    result_proc = result.pr.add_aggregates_coordinates(
-        agg_info=agg_info_ipcc2006_primap_N2O
-    )
-
-    result_proc = result_proc.pr.add_aggregates_coordinates(
-        agg_info=agg_info_ipcc2006_primap_CO2
-    )
-
-    result_proc = result_proc.pr.add_aggregates_coordinates(
-        agg_info=agg_info_ipcc2006_primap_CH4
-    )
-
-    # get processed data
-    output_filename = f"FAOSTAT_Agrifood_system_emissions_{release_name}"
-    output_folder = extracted_data_path / release_name
-    filepath = output_folder / (output_filename + ".nc")
-    ds_original = pm2.open_dataset(filepath)
-
-    # result_proc_if = result_proc.pr.to_interchange_format()
-
-    assert ds_original.broadcast_equals(result_proc)
-    # result_proc_if = result_proc.pr.to_interchange_format()
-    #
-    #
-    #
-    # if not output_folder.exists() :
-    #     output_folder.mkdir()
-    #
-    # filepath = output_folder / (output_filename + ".csv")
-    # print(f"Writing processed primap2 file to {filepath}")
-    # pm2.pm2io.write_interchange_format(
-    #     filepath,
-    #     result_proc_if,
-    # )
-    #
-    # compression = dict(zlib=True, complevel=9)
-    # encoding = {var : compression for var in result_proc.data_vars}
-    # filepath = output_folder / (output_filename + ".nc")
-    # print(f"Writing netcdf file to {filepath}")
-    # result_proc.pr.to_netcdf(filepath, encoding=encoding)
+    # compare
+    assert ds_processed.broadcast_equals(ds_processed_new)
 
 
 def test_read(tmp_path):