Browse Source

datatree experiments

Daniel Busch 2 weeks ago
parent
commit
9e82d8d059

+ 1 - 1
.gitignore

@@ -1,6 +1,6 @@
 # temporary
 src/unfccc_ghg_data/datasets
-
+data
 # logging
 log/*
 geckodriver.log

+ 92 - 0
src/unfccc_ghg_data/unfccc_crf_reader/crf_raw_for_year_sparse_arrays.py

@@ -20,6 +20,7 @@ import numpy as np
 import pandas as pd
 import primap2 as pm2
 import sparse
+import xarray as xr
 
 from unfccc_ghg_data.helper import all_countries
 from unfccc_ghg_data.unfccc_crf_reader.unfccc_crf_reader_prod import (
@@ -356,3 +357,94 @@ def crf_raw_for_year_pandas(
         f"The following countries have updated submission not yet read "
         f"and not included in the dataset: {outdated_countries}"
     )
+
+
+# also add primap-hist as a source, and select by country and gas
+# build a data tree from all countries
+# prototype for data tree merge: which dimensions go in the tree, which in the datasets
+# try out sum and category aggregation
+# gas baskets
+# artificial example with the category in the data tree
+# how would one sum over countries when the countries are in the data tree
+
+
+def crf_raw_for_year_datatree(  # noqa: PLR0912
+    submission_year: int,
+    output_folder: Path,
+    submission_type: str = "CRF",
+    n_countries: int = 5,
+):
+    """
+    Collect all latest CRF/CRT submissions for a given year into an xarray DataTree
+    """
+    if submission_type == "CRF":
+        countries = all_crf_countries
+    elif submission_type == "CRT":
+        countries = all_countries
+    else:
+        raise ValueError("Type must be CRF or CRT")  # noqa: TRY003
+
+    countries = countries[:n_countries]
+    ds_all_CRF = {}
+    outdated_countries = []
+    included_countries = []
+
+    for country in countries:
+        # determine folder
+        try:
+            country_info = get_input_and_output_files_for_country(
+                country,
+                submission_year=submission_year,
+                submission_type=submission_type,
+                verbose=False,
+            )
+
+            if submission_type == "CRF":
+                date_or_version = country_info["date"]
+            else:
+                date_or_version = country_info["version"]
+            # check if the latest submission has been read already
+
+            data_read = submission_has_been_read(
+                country_info["code"],
+                country_info["name"],
+                submission_year=submission_year,
+                date_or_version=date_or_version,
+                submission_type=submission_type,
+                verbose=False,
+            )
+            if not data_read:
+                print(f"Latest submission for {country} has not been read yet.")
+                # TODO: make sure an older submission is read if present.
+                #  currently none is included at all
+                outdated_countries.append(country)
+
+            # read the native format file
+            # print(country_info["output"])
+            input_files = [
+                file for file in country_info["output"] if Path(file).suffix == ".nc"
+            ]
+
+            datalad.api.get(input_files)
+
+            ds_country = pm2.open_dataset(input_files[0])
+
+            ds_all_CRF[country] = ds_country
+
+            included_countries.append(country)
+
+        except Exception as ex:
+            print(f"Exception {ex} occurred for {country}")
+
+    # show info
+    print(f"The following countries are included in the dataset: {included_countries}")
+    print(
+        f"The following countries have updated submission not yet read "
+        f"and not included in the dataset: {outdated_countries}"
+    )
+
+    countries = xr.DataTree.from_dict(ds_all_CRF)
+    dt = xr.DataTree(name="All", children=countries)
+
+    return dt
+

+ 1 - 4
tests/integration/test_crf_for_year.py

@@ -26,7 +26,7 @@ def test_crf_for_year_original_version(tmp_path):
 
 def test_crf_for_year_sparse_arrays(tmp_path):
     start = timer()
-    n_countries = 10
+    n_countries = 20 # 100 will find 26 countries
     crf_raw_for_year_sparse_arrays(
         submission_year=1,
         submission_type="CRT",
@@ -53,6 +53,3 @@ def test_crf_for_year_pandas(tmp_path):
 
     end = timer()
     print(f"Processing time: {end - start} seconds for {n_countries} countries")
-
-
-#

+ 107 - 0
tests/integration/test_datatree_for_crt_crf.py

@@ -0,0 +1,107 @@
+import pytest
+
+from src.unfccc_ghg_data.unfccc_crf_reader.crf_raw_for_year_sparse_arrays import (
+    crf_raw_for_year_datatree
+)
+import primap2 as pm
+import xarray as xr
+import numpy as np
+
+@pytest.fixture
+def crt_dt(tmp_path):
+    """Return a data tree object with CRT data for 25 countries"""
+    n_countries = 100  # 25 of which are filled
+    crt_dt = crf_raw_for_year_datatree(
+        submission_year=1,
+        submission_type="CRT",
+        n_countries=n_countries,
+        output_folder=tmp_path,
+    )
+
+    return crt_dt
+
+def test_country_aggregation(crt_dt):
+
+    # Nigeria (NGA)
+    expected_NGA = 123840.3389332373
+    assert np.squeeze(crt_dt["/NGA"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "2022-01-01"}).to_numpy()).item() == expected_NGA
+    # NGA has data from 2000 to 2022
+    assert len(crt_dt["/NGA"].time) == 23
+
+    # Georgia (GEO)
+    expected_GEO = 10954.899111969342
+    assert np.squeeze(crt_dt["/GEO"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "2022-01-01"}).to_numpy()).item() == expected_GEO
+    # GEO has data from 1990 to 2022
+    assert len(crt_dt["/GEO"].time) == 33
+    # we have a value for 1990
+    assert np.squeeze(crt_dt["/GEO"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "1990-01-01"}).to_numpy()).item()
+
+    # Remove country dimension and add
+    crt_dt["/NGA_GEO"] = crt_dt["/NGA"].to_dataset().pr.loc[{"area (ISO3)" : "NGA"}] + crt_dt["/GEO"].to_dataset().pr.loc[{"area (ISO3)" : "GEO"}]
+
+    # Check if the sum of NGA and GEO is equal to NGA_GEO
+    # This will perform an inner join, NGA has 23 time steps, GEO has 33 time steps,
+    # and the result will have 23 time steps
+    assert np.squeeze(crt_dt["/NGA_GEO"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "2022-01-01"}).to_numpy()).item() == expected_NGA + expected_GEO
+    assert len(crt_dt["/NGA_GEO"].time) == 23
+
+    # The default option for arithmetic operations is to perform an inner join
+    # set options to perform outer join
+    with xr.set_options(arithmetic_join="outer"):
+        crt_dt["/NGA_GEO_outer"] = crt_dt["/NGA"].to_dataset().pr.loc[{"area (ISO3)" : "NGA"}] + crt_dt["/GEO"].to_dataset().pr.loc[{"area (ISO3)" : "GEO"}]
+
+    # Where there is overlap between NGA and GEO, the values will be added
+    assert np.squeeze(crt_dt["/NGA_GEO_outer"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "2022-01-01"}).to_numpy()).item() == expected_NGA + expected_GEO
+    # When only one time series has a value, the value will be set to nan
+    # TODO Is that the behaviour we need?
+    assert np.isnan(np.squeeze(crt_dt["/NGA_GEO_outer"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "1990-01-01"}).to_numpy()).item())
+    # Extra time steps will be added
+    assert len(crt_dt["/NGA_GEO_outer"].time) == 33
+
+    # How can we use data tree methods for this?
+
+    # filter countries we need
+    dt_geo_nga = crt_dt.filter(lambda x: x.name in ["NGA", "GEO"])
+
+    # fails because x will be a DatasetView object that does not have the attribute to_dataset
+    # dt_geo_nga = dt_geo_nga.map_over_datasets(lambda x: x.to_dataset().pr.loc[{"area (ISO3)" : x.name}])
+
+
+
+def test_crf_for_year_original_version(crt_dt):
+
+    print(crt_dt.groups)
+    # output:
+    # ('/', '/DZA', '/ARG', '/AZE', '/BTN', '/BRA', '/BRN', '/CHL', '/CHN', '/COL', '/CIV', '/ECU', '/EGY', '/GEO',
+    # '/GHA', '/GNB', '/GUY', '/IDN', '/KEN', '/LBN', '/MYS', '/MDV', '/MUS', '/MAR', '/NAM', '/NGA')
+    # out of the 100 countries, 25 have CRT data (the groups listed above)
+
+    # to make it more interesting, we also add primap-hist to the data tree
+    primap_hist = pm.open_dataset("../../data/Guetschow_et_al_2025-PRIMAP-hist_v2.6.1_final_13-Mar-2025.nc")
+    # Assign primap hist to the data tree
+    crt_dt["primap_hist"] = primap_hist
+    print(crt_dt.groups)
+    # output:
+    # ('/', '/DZA', '/ARG', '/AZE', '/BTN', '/BRA', '/BRN', '/CHL', '/CHN', '/COL', '/CIV', '/ECU', '/EGY', '/GEO',
+    # '/GHA', '/GNB', '/GUY', '/IDN', '/KEN', '/LBN', '/MYS', '/MDV', '/MUS', '/MAR', '/NAM', '/NGA', '/primap_hist')
+
+    # try some queries
+
+    # get the data for category 1
+    # this will filter the primap-hist data set, because the CRT comes with
+    # the coordinate "category (CRT1)"
+    # Note that the data tree will have the same structure, only the individual
+    # data sets will be filtered
+    # print(dt_crt.sel({"category (IPCC2006_PRIMAP)" : "1"}))
+
+    # we can re-assign the primap hist data set with a renamed category dimension
+    crt_dt["primap_hist"] = primap_hist.rename({"category (IPCC2006_PRIMAP)" : "category (CRT1)"})
+
+    # Now we should be able to filter for categories over all nodes
+    dt_cat1 = crt_dt.sel({"category (CRT1)" : "1"})