Browse Source

datatree experiments

Daniel Busch 2 weeks ago
parent
commit
9e82d8d059

+ 1 - 1
.gitignore

@@ -1,6 +1,6 @@
 # temporary
 src/unfccc_ghg_data/datasets
-
+data
 # logging
 log/*
 geckodriver.log

+ 92 - 0
src/unfccc_ghg_data/unfccc_crf_reader/crf_raw_for_year_sparse_arrays.py

@@ -20,6 +20,7 @@ import numpy as np
 import pandas as pd
 import primap2 as pm2
 import sparse
+import xarray as xr
 
 from unfccc_ghg_data.helper import all_countries
 from unfccc_ghg_data.unfccc_crf_reader.unfccc_crf_reader_prod import (
@@ -356,3 +357,94 @@ def crf_raw_for_year_pandas(
         f"The following countries have updated submission not yet read "
         f"and not included in the dataset: {outdated_countries}"
     )
+
+
+# also add primap-hist as a source, and select by country and gas
+# build a data tree from all countries
+# prototype for data tree merge: which dimensions go in the tree, which in the datasets
+# try out sum and category aggregation
+# gas baskets
+# artificial example with the category in the data tree
+# how would one sum over countries when the countries are in the data tree
+
+
+def crf_raw_for_year_datatree(  # noqa: PLR0912
+    submission_year: int,
+    output_folder: Path,
+    submission_type: str = "CRF",
+    n_countries: int = 5,
+):
+    """
+    Collect all latest CRF/CRT submissions for a given year into an xarray DataTree
+    """
+    if submission_type == "CRF":
+        countries = all_crf_countries
+    elif submission_type == "CRT":
+        countries = all_countries
+    else:
+        raise ValueError("Type must be CRF or CRT")  # noqa: TRY003
+
+    countries = countries[:n_countries]
+    ds_all_CRF = {}
+    outdated_countries = []
+    included_countries = []
+
+    for country in countries:
+        # determine folder
+        try:
+            country_info = get_input_and_output_files_for_country(
+                country,
+                submission_year=submission_year,
+                submission_type=submission_type,
+                verbose=False,
+            )
+
+            if submission_type == "CRF":
+                date_or_version = country_info["date"]
+            else:
+                date_or_version = country_info["version"]
+            # check if the latest submission has been read already
+
+            data_read = submission_has_been_read(
+                country_info["code"],
+                country_info["name"],
+                submission_year=submission_year,
+                date_or_version=date_or_version,
+                submission_type=submission_type,
+                verbose=False,
+            )
+            if not data_read:
+                print(f"Latest submission for {country} has not been read yet.")
+                # TODO: make sure an older submission is read if present.
+                #  currently none is included at all
+                outdated_countries.append(country)
+
+            # read the native format file
+            # print(country_info["output"])
+            input_files = [
+                file for file in country_info["output"] if Path(file).suffix == ".nc"
+            ]
+
+            datalad.api.get(input_files)
+
+            ds_country = pm2.open_dataset(input_files[0])
+
+            ds_all_CRF[country] = ds_country
+
+            included_countries.append(country)
+
+        except Exception as ex:
+            print(f"Exception {ex} occurred for {country}")
+
+    # show info
+    print(f"The following countries are included in the dataset: {included_countries}")
+    print(
+        f"The following countries have updated submission not yet read "
+        f"and not included in the dataset: {outdated_countries}"
+    )
+
+    countries = xr.DataTree.from_dict(ds_all_CRF)
+    dt = xr.DataTree(name="All", children=countries)
+
+    return dt
+

+ 1 - 4
tests/integration/test_crf_for_year.py

@@ -26,7 +26,7 @@ def test_crf_for_year_original_version(tmp_path):
 
 def test_crf_for_year_sparse_arrays(tmp_path):
     start = timer()
-    n_countries = 10
+    n_countries = 20 # 100 will find 26 countries
     crf_raw_for_year_sparse_arrays(
         submission_year=1,
         submission_type="CRT",
@@ -53,6 +53,3 @@ def test_crf_for_year_pandas(tmp_path):
 
     end = timer()
     print(f"Processing time: {end - start} seconds for {n_countries} countries")
-
-
-#

+ 107 - 0
tests/integration/test_datatree_for_crt_crf.py

@@ -0,0 +1,107 @@
+import pytest
+
+from src.unfccc_ghg_data.unfccc_crf_reader.crf_raw_for_year_sparse_arrays import (
+    crf_raw_for_year_datatree
+)
+import primap2 as pm
+import xarray as xr
+import numpy as np
+
+@pytest.fixture
+def crt_dt(tmp_path):
+    """Return a data tree object with CRT data for 25 countries"""
+    n_countries = 100  # 25 of which are filled
+    crt_dt = crf_raw_for_year_datatree(
+        submission_year=1,
+        submission_type="CRT",
+        n_countries=n_countries,
+        output_folder=tmp_path,
+    )
+
+    return crt_dt
+
+def test_country_aggregation(crt_dt):
+
+    # Nigeria (NGA)
+    expected_NGA = 123840.3389332373
+    assert np.squeeze(crt_dt["/NGA"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "2022-01-01"}).to_numpy()).item() == expected_NGA
+    # NGA has data from 2000 to 2022
+    assert len(crt_dt["/NGA"].time) == 23
+
+    # Georgia (GEO)
+    expected_GEO = 10954.899111969342
+    assert np.squeeze(crt_dt["/GEO"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "2022-01-01"}).to_numpy()).item() == expected_GEO
+    # GEO has data from 1990 to 2022
+    assert len(crt_dt["/GEO"].time) == 33
+    # we have a value for 1990
+    assert np.squeeze(crt_dt["/GEO"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "1990-01-01"}).to_numpy()).item()
+
+    # Remove country dimension and add
+    crt_dt["/NGA_GEO"] = crt_dt["/NGA"].to_dataset().pr.loc[{"area (ISO3)" : "NGA"}] + crt_dt["/GEO"].to_dataset().pr.loc[{"area (ISO3)" : "GEO"}]
+
+    # Check if the sum of NGA and GEO is equal to NGA_GEO
+    # This will perform an inner join, NGA has 23 time steps, GEO has 33 time steps,
+    # and the result will have 23 time steps
+    assert np.squeeze(crt_dt["/NGA_GEO"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "2022-01-01"}).to_numpy()).item() == expected_NGA + expected_GEO
+    assert len(crt_dt["/NGA_GEO"].time) == 23
+
+    # The default option for arithmetic operations is to perform an inner join
+    # set options to perform outer join
+    with xr.set_options(arithmetic_join="outer"):
+        crt_dt["/NGA_GEO_outer"] = crt_dt["/NGA"].to_dataset().pr.loc[{"area (ISO3)" : "NGA"}] + crt_dt["/GEO"].to_dataset().pr.loc[{"area (ISO3)" : "GEO"}]
+
+    # Where there is overlap between NGA and GEO, the values will be added
+    assert np.squeeze(crt_dt["/NGA_GEO_outer"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "2022-01-01"}).to_numpy()).item() == expected_NGA + expected_GEO
+    # When only one time series has a value, the value will be set to nan
+    # TODO Is that the behaviour we need?
+    assert np.isnan(np.squeeze(crt_dt["/NGA_GEO_outer"]["CO2"].sel(
+        {"category (CRT1)" : "1", "class" : "Total", "time" : "1990-01-01"}).to_numpy()).item())
+    # Extra time steps will be added
+    assert len(crt_dt["/NGA_GEO_outer"].time) == 33
+
+    # How can we use data tree methods for this?
+
+    # filter countries we need
+    dt_geo_nga = crt_dt.filter(lambda x: x.name in ["NGA", "GEO"])
+
+    # fails because x will be a DatasetView object that does not have the attribute to_dataset
+    # dt_geo_nga = dt_geo_nga.map_over_datasets(lambda x: x.to_dataset().pr.loc[{"area (ISO3)" : x.name}])
+
+
+
+def test_crf_for_year_original_version(crt_dt):
+
+    print(crt_dt.groups)
+    # output:
+    # ('/', '/DZA', '/ARG', '/AZE', '/BTN', '/BRA', '/BRN', '/CHL', '/CHN', '/COL', '/CIV', '/ECU', '/EGY', '/GEO',
+    # '/GHA', '/GNB', '/GUY', '/IDN', '/KEN', '/LBN', '/MYS', '/MDV', '/MUS', '/MAR', '/NAM', '/NGA')
+    # out of the 100 countries, 25 have CRT data (the groups listed above)
+
+    # to make it more interesting, we also add primap-hist to the data tree
+    primap_hist = pm.open_dataset("../../data/Guetschow_et_al_2025-PRIMAP-hist_v2.6.1_final_13-Mar-2025.nc")
+    # Assign primap hist to the data tree
+    crt_dt["primap_hist"] = primap_hist
+    print(crt_dt.groups)
+    # output:
+    # ('/', '/DZA', '/ARG', '/AZE', '/BTN', '/BRA', '/BRN', '/CHL', '/CHN', '/COL', '/CIV', '/ECU', '/EGY', '/GEO',
+    # '/GHA', '/GNB', '/GUY', '/IDN', '/KEN', '/LBN', '/MYS', '/MDV', '/MUS', '/MAR', '/NAM', '/NGA', '/primap_hist')
+
+    # try some queries
+
+    # get the data for category 1
+    # this will filter the primap-hist data set, because the CRT comes with
+    # the coordinate "category (CRT1)"
+    # Note that the data tree will have the same structure, only the individual
+    # data sets will be filtered
+    # print(dt_crt.sel({"category (IPCC2006_PRIMAP)" : "1"}))
+
+    # we can re-assign the primap hist data set with a renamed category dimension
+    crt_dt["primap_hist"] = primap_hist.rename({"category (IPCC2006_PRIMAP)" : "category (CRT1)"})
+
+    # Now we should be able to filter for categories over all nodes
+    dt_cat1 = crt_dt.sel({"category (CRT1)" : "1"})