
functions for DI data processing and conversion to IPCC2006 categories

Johannes Gütschow 1 year ago
parent commit d74f3ee775

+ 210 - 0
code/UNFCCC_DI_reader/UNFCCC_DI_reader_config.py

@@ -241,4 +241,214 @@ di_to_pm2if_template_ai = {
     "time_format": "%Y",
 }
 
+cat_conversion = {
+    # ANNEXI to come (low priority as we read from CRF files)
+    'BURDI_to_IPCC2006_PRIMAP': {
+        'mapping': {
+            '1': '1',
+            '1.A': '1.A',
+            '1.A.1': '1.A.1',
+            '1.A.2': '1.A.2',
+            '1.A.3': '1.A.3',
+            '1.A.4': '1.A.4',
+            '1.A.5': '1.A.5',
+            '1.B': '1.B',
+            '1.B.1': '1.B.1',
+            '1.B.2': '1.B.2',
+            '2.A': '2.A',
+            '2.B': 'M.2.B_2.B',
+            '2.C': '2.C',
+            '2.D': 'M.2.H.1_2',
+            '2.E': 'M.2.B_2.E',
+            '2.F': '2.F',
+            '2.G': '2.H.3',
+            '4': 'M.AG',
+            '4.A': '3.A.1',
+            '4.B': '3.A.2',
+            '4.C': '3.C.7',
+            '4.D': 'M.3.C.45.AG',
+            '4.E': '3.C.1.c',
+            '4.F': '3.C.1.b',
+            '4.G': '3.C.8',
+            '5': 'M.LULUCF',
+            '6': '4',
+            '6.A': '4.A',
+            '6.B': '4.D',
+            '6.C': '4.C',
+            '6.D': '4.E',
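+            # the numeric keys are UNFCCC DI category codes; judging from the
+            # mapped targets: 24540 = total incl. LULUCF, 15163 = total excl.
+            # LULUCF, 14637 / 14424 / 14423 = bunkers (total / aviation /
+            # marine), 14638 = CO2 from biomass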
+            '24540': '0',
+            '15163': 'M.0.EL',
+            '14637': 'M.BK',
+            '14424': 'M.BK.A',
+            '14423': 'M.BK.M',
+            '14638': 'M.BIO',
+            '7': '5',
+        },  # 5.A-D ignored as they do not fit the 2006 categories
+        'aggregate': {
+            '2.B': {'sources': ['M.2.B_2.B', 'M.2.B_2.E'], 'name': 'Chemical Industry'},
+            '2.H': {'sources': ['M.2.H.1_2', '2.H.3'], 'name': 'Other'},
+            '2': {'sources': ['2.A', '2.B', '2.C', '2.F', '2.H'],
+                  'name': 'Industrial Processes and Product Use'},
+            '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
+            '3.C.1': {'sources': ['3.C.1.b', '3.C.1.c'],
+                         'name': 'Emissions from biomass burning'},
+            'M.3.C.1.AG': {'sources': ['3.C.1.b', '3.C.1.c'],
+                         'name': 'Emissions from biomass burning (Agriculture)'},
+            '3.C': {'sources': ['3.C.1', 'M.3.C.45.AG', '3.C.7', '3.C.8'],
+                         'name': 'Aggregate sources and non-CO2 emissions sources on land'},
+            'M.3.C.AG': {'sources': ['M.3.C.1.AG', 'M.3.C.45.AG', '3.C.7', '3.C.8'],
+                         'name': 'Aggregate sources and non-CO2 emissions sources on land ('
+                                 'Agriculture)'},
+            'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock'},
+        },
+    },
+}
+
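+# A minimal sketch of how the 'mapping' table above is applied; it mirrors
+# convert_categories in UNFCCC_DI_reader_core.py, which uses pandas' replace,
+# so codes without a mapping entry pass through unchanged:
+#
+#     import pandas as pd
+#     mapping = cat_conversion['BURDI_to_IPCC2006_PRIMAP']['mapping']
+#     pd.Series(['4.A', '4.B', '24540']).replace(mapping).tolist()
+#     # -> ['3.A.1', '3.A.2', '0']
+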
+di_processing_templates = {
+    # templates for the DI processing. Most processing rules apply to several
+    # versions, so we store them here and refer to them in the processing info dict
+    'BFA': {
+        'DI2022-08-22': {  # remove 2007: it seems to contain summed sectors
+            # (Agriculture and LULUCF) and to miss sectors (e.g. 1, 2 for CH4, N2O)
+            'remove_years': ['2007'],
+        },
+    },
+    'BIH': {
+        'DI2022-08-22': {
+            # downscaling in two steps
+            # 1990-2001 has different coverage than 2002-2012 and 2013-2014
+            # do not downscale KyotoGHG for 1990-2001 as that's aggregated
+            # later to avoid inconsistencies
+            'downscale': {
+                'sectors': {
+                    '1.A_1990': {
+                        'basket': '1.A',
+                        'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4',
+                                            '1.A.5'],
+                        'entities': ['CH4', 'CO2', 'N2O', 'CO', 'NMVOC', 'NOx', 'SO2'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '1.B_1990': {
+                        'basket': '1.B',
+                        'basket_contents': ['1.B.1', '1.B.2'],
+                        'entities': ['CH4', 'CO2', 'NMVOC', 'SO2'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '2_1990': {
+                        'basket': '2',
+                        'basket_contents': ['2.A', '2.B', '2.C', '2.D'],
+                        'entities': ['CH4', 'CO2', 'N2O', 'CO', 'NMVOC', 'NOx', 'SO2'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '4_1990': {
+                        'basket': '4',
+                        'basket_contents': ['4.A', '4.B', '4.C', '4.D', '4.E'],
+                        'entities': ['CH4', 'N2O'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '5_1990': {
+                        'basket': '5',
+                        'basket_contents': ['5.A'],
+                        'entities': ['CO2'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '6_1990': {
+                        'basket': '6',
+                        'basket_contents': ['6.A'],
+                        'entities': ['CH4'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                },
+                'entities': { # 2002-2014
+                    'KYOTO': {
+                        'basket': 'KYOTOGHG (SARGWP100)',
+                        'basket_contents': ['CH4', 'CO2', 'N2O'],
+                        'sel': {'category (BURDI)':
+                                    ['1', '1.A', '1.A.1', '1.A.2', '1.A.3', '1.A.4',
+                                     '1.A.5', '1.B', '1.B.1', '1.B.2', '2', '2.A',
+                                     '2.B', '2.C', '2.D', '2.E', '4', '4.A', '4.B',
+                                     '4.C', '4.D', '4.E', '5', '5.A', '6', '6.A',
+                                     '6.B', '6.C', '14423', '14424', '14637',
+                                     '15163', '24540',
+                                     ],
+                                'time': ['2002', '2003', '2004', '2005', '2006',
+                                         '2007', '2008', '2009', '2010', '2011',
+                                         '2012', '2013', '2014'],
+                                },
+                    },
+                },
+            },
+        },
+    },
+}
+
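+# Sketch of how a 'sectors' downscaling rule above is consumed; this mirrors
+# process_UNFCCC_DI_for_party in UNFCCC_DI_reader_core.py, which passes the
+# rule (minus 'entities') straight to primap2's downscale_timeseries
+# ('data' is a hypothetical primap2 dataset):
+#
+#     spec = dict(di_processing_templates['BIH']['DI2022-08-22']
+#                 ['downscale']['sectors']['1.A_1990'])
+#     for entity in spec.pop('entities'):
+#         data[entity] = data[entity].pr.downscale_timeseries(**spec)
+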
+di_processing_info = {
+    # only countries with special processing are listed here.
+    # category conversion is defined on a country-group level.
+    # the 'default' entry is used if no version-specific entry is found, so
+    # new versions can be processed before a configuration for them exists
+    # (see the lookup sketch below the dict).
+    'BFA': {
+        'default': di_processing_templates['BFA']['DI2022-08-22'],
+        'DI2022-08-22': di_processing_templates['BFA']['DI2022-08-22'],
+    },
+    'BIH': {
+        'default': di_processing_templates['BIH']['DI2022-08-22'],
+        'DI2022-08-22': di_processing_templates['BIH']['DI2022-08-22'],
+    },
+}
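+
+# Sketch of the intended lookup with fallback to 'default' (hypothetical
+# helper, not part of this module):
+#
+#     def get_processing_info(country_code: str, version: str):
+#         country_info = di_processing_info.get(country_code, {})
+#         return country_info.get(version, country_info.get('default'))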
 
+gas_baskets = {
+    'HFCS (SARGWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
+                         'HFC134a', 'HFC143', 'HFC143a', 'HFC152a', 'HFC227ea',
+                         'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc', 'HFC404a',
+                         'HFC407c', 'HFC410a', 'HFC4310mee', 'OTHERHFCS (SARGWP100)'],
+    'HFCS (AR4GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
+                         'HFC134a', 'HFC143', 'HFC143a', 'HFC152a', 'HFC227ea',
+                         'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc', 'HFC404a',
+                         'HFC407c', 'HFC410a', 'HFC4310mee',
+                         'Unspecified mix of HFCs (AR4GWP100)'],
+    'HFCS (AR5GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
+                         'HFC134a', 'HFC143', 'HFC143a', 'HFC152a', 'HFC227ea',
+                         'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc', 'HFC404a',
+                         'HFC407c', 'HFC410a', 'HFC4310mee'],
+    'PFCS (SARGWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8'],
+    'PFCS (AR4GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',
+                         'Unspecified mix of PFCs (AR4GWP100)'],
+    'PFCS (AR5GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8'],
+    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
+    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
+    'FGASES (AR5GWP100)': ['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
+    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (SARGWP100)', 'PFCS (SARGWP100)'],
+    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR4GWP100)', 'PFCS (AR4GWP100)',
+                             'Unspecified mix of HFCs (AR4GWP100)', 'Unspecified mix of PFCs (AR4GWP100)'],
+    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR5GWP100)', 'PFCS (AR5GWP100)'],
+}
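+
+# Sketch of how these basket definitions are used; mirrors the gas basket
+# creation in process_UNFCCC_DI_for_party ('ds' is a hypothetical primap2
+# dataset):
+#
+#     basket = 'KYOTOGHG (SARGWP100)'
+#     contents = [g for g in gas_baskets[basket] if g in ds.data_vars]
+#     ds[basket] = ds.pr.gas_basket_contents_sum(
+#         basket=basket, basket_contents=contents, min_count=1)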

+ 274 - 7
code/UNFCCC_DI_reader/UNFCCC_DI_reader_core.py

@@ -1,19 +1,23 @@
 import primap2 as pm2
 import unfccc_di_api
 import pandas as pd
+import numpy as np
 import pycountry
 import itertools
 import json
 import copy
 import xarray as xr
 from datetime import date
-from typing import Optional, Dict
+from typing import Optional, Dict, List
 from pathlib import Path
+from copy import deepcopy
 
 from UNFCCC_DI_reader_config import di_to_pm2if_template_nai
 from UNFCCC_DI_reader_config import di_to_pm2if_template_ai
 from UNFCCC_DI_reader_config import di_query_filters
+from UNFCCC_DI_reader_config import cat_conversion
 from util import NoDIDataError, extracted_data_path, get_country_name
+from util import nAI_countries, AI_countries
 
 
 def read_UNFCCC_DI_for_party(
@@ -27,7 +31,8 @@ def read_UNFCCC_DI_for_party(
         debug: Optional[bool]=False,
 ):
     """
-    # TODO
+    Reads data for a party from the UNFCCC DI interface and saves it in
+    native and interchange format.
     """
 
     # read the data
@@ -44,11 +49,11 @@ def read_UNFCCC_DI_for_party(
 
     # determine filename
     if save_data:
-        filename = determine_filename(party_code, date_str)
+        filename = determine_filename(party_code, date_str, True)
     else:
         filename = None
 
-    # convert it to pm2 interchange format and save
+    # convert raw data to pm2 interchange format and save
     data_if = convert_DI_data_to_pm2_if(
         data=data_df,
         pm2if_specifications=pm2if_specifications,
@@ -58,7 +63,7 @@ def read_UNFCCC_DI_for_party(
         debug=debug,
     )
 
-    # convert to native pm2 format and save that
+    # convert raw data to native pm2 format and save that
     data_pm2 = convert_DI_IF_data_to_pm2(
         data_di_if=data_if,
         filename=filename,
@@ -67,6 +72,202 @@ def read_UNFCCC_DI_for_party(
     return data_pm2
 
 
+def process_UNFCCC_DI_for_party(
+        data_country: xr.Dataset,
+        country: str,
+        cat_terminology_in: str,
+        entities_to_ignore: List[str],
+        sectors: List[str],
+        gas_baskets: Dict[str, List[str]],
+        processing_info_country: Optional[Dict] = None,
+) -> xr.Dataset:
+    """
+        Process data from DI interface (where necessary).
+        * Downscaling including subtraction of time series
+        * country specific sector aggregation
+        * Conversion to IPCC2006 categories
+        * general sector and gas basket aggregation (in new categories)
+    """
+    #### 1: general processing
+    # remove unused cats
+    data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
+    # remove unused years
+    data_country = data_country.dropna('time', how='all')
+    # remove variables containing only nan
+    nan_vars_country = [var for var in data_country.data_vars
+                        if data_country[var].isnull().all().data]
+    data_country = data_country.drop_vars(nan_vars_country)
+
+    # remove unnecessary variables
+    entities_ignore_present = [entity for entity in entities_to_ignore if
+                               entity in data_country.data_vars]
+    data_country = data_country.drop_vars(entities_ignore_present)
+
+    #### 2: country specific processing
+    if processing_info_country is not None:
+        if 'tolerance' in processing_info_country:
+            tolerance = processing_info_country["tolerance"]
+        else:
+            tolerance = 0.01
+
+        # take only desired years
+        if 'years' in processing_info_country:
+            data_country = data_country.pr.loc[
+                {'time': processing_info_country['years']}]
+
+        # remove timeseries if desired
+        if 'remove_ts' in processing_info_country:
+            for case in processing_info_country['remove_ts']:
+                # copy the rule so popping does not mutate the shared template
+                remove_info = dict(processing_info_country['remove_ts'][case])
+                entities = remove_info.pop("entities")
+                for entity in entities:
+                    data_country[entity].pr.loc[remove_info] = \
+                        data_country[entity].pr.loc[remove_info] * np.nan
+
+        # remove all data for given years if necessary
+        if 'remove_years' in processing_info_country:
+            data_country.pr.loc[{'time': processing_info_country['remove_years']}] = \
+                data_country.pr.loc[{'time': processing_info_country[
+                    'remove_years']}] * np.nan
+
+        # subtract categories
+        if 'subtract_cats' in processing_info_country:
+            subtract_cats_current = processing_info_country['subtract_cats']
+            if 'entities' in subtract_cats_current.keys():
+                entities_current = subtract_cats_current['entities']
+            else:
+                entities_current = list(data_country.data_vars)
+            print(f"Subtracting categories for country {country}, entities "
+                  f"{entities_current}")
+            for cat_to_generate in subtract_cats_current:
+                if cat_to_generate == 'entities':
+                    # 'entities' is a setting, not a category to generate
+                    continue
+                cats_to_subtract = subtract_cats_current[cat_to_generate]['subtract']
+                data_sub = data_country.pr.loc[{'category': cats_to_subtract}].pr.sum(
+                    dim='category', skipna=True, min_count=1)
+                data_parent = data_country.pr.loc[
+                    {'category': subtract_cats_current[cat_to_generate]['parent']}]
+                data_agg = data_parent - data_sub
+                nan_vars = [var for var in data_agg.data_vars
+                            if data_agg[var].isnull().all().data]
+                data_agg = data_agg.drop_vars(nan_vars)
+                if len(data_agg.data_vars) > 0:
+                    print(f"Generating {cat_to_generate} through subtraction")
+                    data_agg = data_agg.expand_dims([f'category ('
+                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.assign_coords(
+                        coords={f'category ({cat_terminology_in})':
+                                    (f'category ({cat_terminology_in})',
+                                     [cat_to_generate])})
+                    data_country = data_country.pr.merge(data_agg, tolerance=tolerance)
+                else:
+                    print(f"no data to generate category {cat_to_generate}")
+
+        # downscaling
+        if 'downscale' in processing_info_country:
+            if 'sectors' in processing_info_country['downscale']:
+                sector_downscaling = processing_info_country['downscale']['sectors']
+                for case in sector_downscaling.keys():
+                    print(f"Downscaling for {case}.")
+                    # copy the rule so popping does not mutate the shared template
+                    sector_downscaling_current = dict(sector_downscaling[case])
+                    entities = sector_downscaling_current.pop('entities')
+                    for entity in entities:
+                        data_country[entity] = data_country[
+                            entity].pr.downscale_timeseries(
+                            **sector_downscaling_current)
+
+            if 'entities' in processing_info_country['downscale']:
+                entity_downscaling = processing_info_country['downscale']['entities']
+                for case in entity_downscaling.keys():
+                    print(f"Downscaling entities for {case}.")
+                    data_country = data_country.pr.downscale_gas_timeseries(
+                        **entity_downscaling[case], skipna=True,
+                        skipna_evaluation_dims=None)
+
+        # aggregate categories
+        if 'aggregate_cats' in processing_info_country:
+            aggregate_cats_current = processing_info_country['aggregate_cats']
+            print(f"Aggregating categories for country {country}")
+            for cat_to_agg in aggregate_cats_current:
+                print(f"Category: {cat_to_agg}")
+                source_cats = aggregate_cats_current[cat_to_agg]['sources']
+                data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
+                    dim='category', skipna=True, min_count=1)
+                nan_vars = [var for var in data_agg.data_vars
+                            if data_agg[var].isnull().all().data]
+                data_agg = data_agg.drop_vars(nan_vars)
+                if len(data_agg.data_vars) > 0:
+                    data_agg = data_agg.expand_dims([f'category ('
+                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.assign_coords(
+                        coords={f'category ({cat_terminology_in})':
+                                    (f'category ({cat_terminology_in})', [cat_to_agg])})
+                    data_country = data_country.pr.merge(data_agg, tolerance=tolerance)
+                else:
+                    print(f"no data to aggregate category {cat_to_agg}")
+
+        # aggregate gases if desired
+        if 'aggregate_gases' in processing_info_country:
+            for case in processing_info_country['aggregate_gases'].keys():
+                case_info = processing_info_country['aggregate_gases'][case]
+                data_country[case_info['basket']] = \
+                    data_country.pr.fill_na_gas_basket_from_contents(
+                        **case_info)
+
+    #### 3: map categories
+    if country in nAI_countries:
+        # conversion from BURDI to IPCC2006_PRIMAP needed
+        cat_terminology_out = 'IPCC2006_PRIMAP'
+        data_country = convert_categories(
+            data_country,
+            cat_conversion[f"{cat_terminology_in}_to_{cat_terminology_out}"],
+            cat_terminology_out,
+            debug=False,
+            tolerance=0.01,
+        )
+    else:
+        cat_terminology_out = cat_terminology_in
+
+    # more general processing
+    # reduce categories to output cats
+    cats_to_keep = [cat for cat in
+                    data_country.coords[f'category ({cat_terminology_out})'].values if
+                    cat in sectors]
+    data_country = data_country.pr.loc[{'category': cats_to_keep}]
+
+    # create gas baskets
+    entities_present = set(data_country.data_vars)
+    for basket in gas_baskets.keys():
+        basket_contents_present = [gas for gas in gas_baskets[basket] if
+                                   gas in entities_present]
+        if len(basket_contents_present) > 0:
+            if basket in data_country.data_vars:
+                data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
+                    basket=basket, basket_contents=basket_contents_present, min_count=1)
+            else:
+                try:
+                    data_country[basket] = xr.full_like(
+                        data_country["CO2"], np.nan).pr.quantify(
+                        units="Gg CO2 / year")
+                    data_country[basket].attrs = {
+                        "entity": basket.split(' ')[0],
+                        "gwp_context": basket.split(' ')[1][1:-1]}
+                    data_country[basket] = data_country.pr.gas_basket_contents_sum(
+                        basket=basket, basket_contents=basket_contents_present,
+                        min_count=1)
+                except Exception:
+                    print(f"No gas basket {basket} created for {country}")
+
+    # amend title and comment
+    data_country.attrs["comment"] += f" Processed on {date.today()}"
+    data_country.attrs["title"] += f" Processed on {date.today()}"
+
+    return data_country
+
+
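+# Hypothetical usage sketch: 'data_pm2' is a dataset returned by
+# read_UNFCCC_DI_for_party above, 'sectors_2006' an assumed list of
+# IPCC2006_PRIMAP output categories:
+#
+#     from UNFCCC_DI_reader_config import di_processing_info, gas_baskets
+#     data_proc = process_UNFCCC_DI_for_party(
+#         data_country=data_pm2, country='BFA',
+#         cat_terminology_in='BURDI', entities_to_ignore=[],
+#         sectors=sectors_2006, gas_baskets=gas_baskets,
+#         processing_info_country=di_processing_info['BFA']['default'],
+#     )
+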
 def read_UNFCCC_DI_for_party_df(
         party_code: str,
         category_groups: Optional[Dict]=None,
@@ -420,7 +621,8 @@ def convert_DI_IF_data_to_pm2(
 
 def determine_filename(
         party_code: str,
-        date_str: str
+        date_str: str,
+        raw: bool = False,
 )->Path:
     """
     Determine the filename for a dataset from given country code and date string.
@@ -432,6 +634,8 @@ def determine_filename(
         ISO 3 letter code of the country
     date_str:
         formatted date string
+    raw:
+        bool specifying whether the filename for raw or processed data
+        should be returned
 
     Returns
     _______
@@ -465,10 +669,73 @@ def determine_filename(
         else:
             country_folder.mkdir()
 
-        filename = Path(country_folder) / f"{party_code}_DI_{date_str}"
+        if raw:
+            filename = Path(country_folder) / f"{party_code}_DI_{date_str}_raw"
+        else:
+            filename = Path(country_folder) / f"{party_code}_DI_{date_str}"
 
     return filename
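+
+# e.g. determine_filename('BFA', '22-08-2022', raw=True) now yields a path
+# ending in 'BFA_DI_22-08-2022_raw' (hypothetical date string, used as passed)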
 
+
+def convert_categories(
+        ds_input: xr.Dataset,
+        conversion: Dict[str, Dict[str, str]],
+        terminology_to: str,
+        debug: bool = False,
+        tolerance: float = 0.01,
+) -> xr.Dataset:
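+    """
+    Convert a dataset to the given target category terminology using a
+    conversion dict: 'mapping' renames categories, 'aggregate' sums the
+    mapped categories into new ones.
+    """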
+    ds_converted = ds_input.copy(deep=True)
+    ds_converted.attrs = deepcopy(ds_input.attrs)
+
+    # change category terminology
+    cat_dim = ds_converted.attrs["cat"]
+    ds_converted.attrs["cat"] = f"category ({terminology_to})"
+    ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})
+
+    # find categories present in dataset
+    cats_present = list(
+        ds_converted.coords[f'category ({terminology_to})'].values)
+
+    # restrict categories and map category names
+    if 'mapping' in conversion.keys():
+        mapping_cats_present = [cat for cat in list(conversion['mapping'].keys()) if
+                                cat in cats_present]
+        ds_converted = ds_converted.pr.loc[
+            {'category': mapping_cats_present}]
+
+        from_cats = ds_converted.coords[f'category ({terminology_to})'].values
+        to_cats = pd.Series(from_cats).replace(conversion['mapping'])
+        ds_converted = ds_converted.assign_coords({f'category ({terminology_to})':
+                                                   (f'category ({terminology_to})',
+                                                    to_cats)})
+
+    # redo the list of present cats after mapping, as we have new categories in the
+    # target terminology now
+    cats_present_mapped = list(
+        ds_converted.coords[f'category ({terminology_to})'].values)
+    # aggregate categories
+    if 'aggregate' in conversion:
+        aggregate_cats = conversion['aggregate']
+        for cat_to_agg in aggregate_cats:
+            if debug:
+                print(f"Category: {cat_to_agg}")
+            source_cats = [cat for cat in aggregate_cats[cat_to_agg]['sources'] if
+                           cat in cats_present_mapped]
+            data_agg = ds_converted.pr.loc[{'category': source_cats}].pr.sum(
+                dim='category', skipna=True, min_count=1)
+            nan_vars = [var for var in data_agg.data_vars
+                        if data_agg[var].isnull().all().data]
+            data_agg = data_agg.drop_vars(nan_vars)
+            if len(data_agg.data_vars) > 0:
+                data_agg = data_agg.expand_dims([f'category ({terminology_to})'])
+                data_agg = data_agg.assign_coords(
+                    coords={f'category ({terminology_to})':
+                                (f'category ({terminology_to})', [cat_to_agg])})
+                ds_converted = ds_converted.pr.merge(data_agg, tolerance=tolerance)
+            else:
+                print(f"no data to aggregate category {cat_to_agg}")
+
+    return ds_converted
+
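+# Hypothetical usage, assuming 'ds_burdi' is a primap2 dataset on BURDI
+# categories:
+#
+#     ds_2006 = convert_categories(
+#         ds_burdi, cat_conversion['BURDI_to_IPCC2006_PRIMAP'],
+#         'IPCC2006_PRIMAP', debug=False, tolerance=0.01)
+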
 # TODO
 
 # functions

+ 5 - 1
code/UNFCCC_DI_reader/util.py

@@ -1,4 +1,5 @@
 from pathlib import Path
+import unfccc_di_api
 # imports for copied functions
 import pycountry
 
@@ -9,13 +10,16 @@ code_path = root_path / "code"
 downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
 extracted_data_path = root_path / "extracted_data" / "UNFCCC"
 
+reader = unfccc_di_api.UNFCCCApiReader()
 
+nAI_countries = list(reader.non_annex_one_reader.parties["code"])
+AI_countries = list(reader.annex_one_reader.parties["code"])
 
 class NoDIDataError(Exception):
     pass
 
 
-# the following is copied from other cub-packages
+# the following is copied from other sub-packages
 # TODO: move these functions to a common location to allow easy importing into all modules
 custom_country_mapping = {
     "EUA": "European Union",