test_datatree_for_crt_crf.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. import numpy as np
  2. import primap2 as pm
  3. import pytest
  4. import xarray as xr
  5. from src.unfccc_ghg_data.unfccc_crf_reader.crf_raw_for_year_sparse_arrays import (
  6. crf_raw_for_year_datatree,
  7. )
  8. @pytest.fixture
  9. def crt_dt(tmp_path):
  10. """Return a data tree object with CRT data for 25 countries"""
  11. n_countries = 100 # 25 of which are filled
  12. crt_dt = crf_raw_for_year_datatree(
  13. submission_year=1,
  14. submission_type="CRT",
  15. n_countries=n_countries,
  16. output_folder=tmp_path,
  17. )
  18. return crt_dt
  19. def test_country_aggregation(crt_dt):
  20. # Nigeria (NGA)
  21. expected_NGA = 123840.3389332373
  22. assert (
  23. np.squeeze(
  24. crt_dt["/NGA"]["CO2"]
  25. .sel({"category (CRT1)": "1", "class": "Total", "time": "2022-01-01"})
  26. .to_numpy()
  27. ).item()
  28. == expected_NGA
  29. )
  30. # NGA has data from 2000 to 2022
  31. assert len(crt_dt["/NGA"].time) == 23
  32. # Georgia (GEO)
  33. expected_GEO = 10954.899111969342
  34. assert (
  35. np.squeeze(
  36. crt_dt["/GEO"]["CO2"]
  37. .sel({"category (CRT1)": "1", "class": "Total", "time": "2022-01-01"})
  38. .to_numpy()
  39. ).item()
  40. == expected_GEO
  41. )
  42. # GEO has data from 1990 to 2022
  43. assert len(crt_dt["/GEO"].time) == 33
  44. # we have a value for 1990
  45. assert np.squeeze(
  46. crt_dt["/GEO"]["CO2"]
  47. .sel({"category (CRT1)": "1", "class": "Total", "time": "1990-01-01"})
  48. .to_numpy()
  49. ).item()
  50. # Remove country dimension and add
  51. crt_dt["/NGA_GEO"] = (
  52. crt_dt["/NGA"].to_dataset().pr.loc[{"area (ISO3)": "NGA"}]
  53. + crt_dt["/GEO"].to_dataset().pr.loc[{"area (ISO3)": "GEO"}]
  54. )
  55. # Check if the sum of NGA and GEO is equal to NGA_GEO
  56. # This will perform an inner join, NGA has 23 time steps, GEO has 33 time steps,
  57. # and the result will have 23 time steps
  58. assert (
  59. np.squeeze(
  60. crt_dt["/NGA_GEO"]["CO2"]
  61. .sel({"category (CRT1)": "1", "class": "Total", "time": "2022-01-01"})
  62. .to_numpy()
  63. ).item()
  64. == expected_NGA + expected_GEO
  65. )
  66. assert len(crt_dt["/NGA_GEO"].time) == 23
  67. # The default option for arithmetic operations is to perform an inner join
  68. # set options to perform outer join
  69. with xr.set_options(arithmetic_join="outer"):
  70. crt_dt["/NGA_GEO_outer"] = (
  71. crt_dt["/NGA"].to_dataset().pr.loc[{"area (ISO3)": "NGA"}]
  72. + crt_dt["/GEO"].to_dataset().pr.loc[{"area (ISO3)": "GEO"}]
  73. )
  74. # Where there is overlap between NGA and GEO, the values will be added
  75. assert (
  76. np.squeeze(
  77. crt_dt["/NGA_GEO_outer"]["CO2"]
  78. .sel({"category (CRT1)": "1", "class": "Total", "time": "2022-01-01"})
  79. .to_numpy()
  80. ).item()
  81. == expected_NGA + expected_GEO
  82. )
  83. # When only one time series has a value, the value will be set to nan
  84. # TODO Is that the behaviour we need?
  85. assert np.isnan(
  86. np.squeeze(
  87. crt_dt["/NGA_GEO_outer"]["CO2"]
  88. .sel({"category (CRT1)": "1", "class": "Total", "time": "1990-01-01"})
  89. .to_numpy()
  90. ).item()
  91. )
  92. # Extra time steps will be added
  93. assert len(crt_dt["/NGA_GEO_outer"].time) == 33
  94. # How can we use data tree methods for this?
  95. # filter countries we need
  96. dt_geo_nga = crt_dt.filter(lambda x: x.name in ["NGA", "GEO"]) # noqa: F841
  97. # fails because, x will be a data set view object that does
  98. # not have the attribute to_dataset
  99. # dt_geo_nga = dt_geo_nga.map_over_datasets(
  100. # lambda x: x.to_dataset().pr.loc[{"area (ISO3)" : x.name}])
  101. def test_crf_for_year_original_version(crt_dt):
  102. print(crt_dt.groups)
  103. # output:
  104. # ('/', '/DZA', '/ARG', '/AZE', '/BTN', '/BRA', '/BRN', '/CHL', '/CHN',
  105. # '/COL', '/CIV', '/ECU', '/EGY', '/GEO',
  106. # '/GHA', '/GNB', '/GUY', '/IDN', '/KEN', '/LBN', '/MYS', '/MDV',
  107. # '/MUS', '/MAR', '/NAM', '/NGA')
  108. # out of the 100 countries, 26 have CRT data
  109. # to make it more interesting, we also add primap-hist to the data tree
  110. primap_hist = pm.open_dataset(
  111. "../../data/Guetschow_et_al_2025-PRIMAP-hist_v2.6.1_final_13-Mar-2025.nc"
  112. )
  113. # Assign primap hist to the data tree
  114. crt_dt["primap_hist"] = primap_hist
  115. print(crt_dt.groups)
  116. # output:
  117. # ('/', '/DZA', '/ARG', '/AZE', '/BTN', '/BRA', '/BRN', '/CHL',
  118. # '/CHN', '/COL', '/CIV', '/ECU', '/EGY', '/GEO',
  119. # '/GHA', '/GNB', '/GUY', '/IDN', '/KEN', '/LBN', '/MYS', '/MDV',
  120. # '/MUS', '/MAR', '/NAM', '/NGA', '/primap_hist')
  121. # try some queries
  122. # get the data for category 1
  123. # this will filter the primap-hist data set, because the CRT comes with
  124. # the coordinate "category (CRT1)"
  125. # Note that the data tree will have the same structure, only the individual
  126. # data sets will be filtered
  127. # print(dt_crt.sel({"category (IPCC2006_PRIMAP)" : "1"}))
  128. # we can re-assign the primap hist data set with a renamed category dimension
  129. crt_dt["primap_hist"] = primap_hist.rename(
  130. {"category (IPCC2006_PRIMAP)": "category (CRT1)"}
  131. )
  132. # Now we should be able to filter for categories over all nodes
  133. dt_cat1 = crt_dt.sel({"category (CRT1)": "1"}) # noqa: F841