Quellcode durchsuchen

Add code for Taiwan 2023 inventory and some modifications to older code

Johannes Gütschow vor 10 Monaten
Ursprung
Commit
16b87f0f0c

+ 0 - 5
UNFCCC_GHG_data/UNFCCC_reader/Israel/config_ISR_BUR2.py

@@ -388,11 +388,6 @@ cat_conversion = {
         'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'], 'name': 'National total '
                                                                     'excluding LULUCF'},
     },
-    'basket_copy': {
-        'GWPs_to_add': ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': 'SARGWP100',
-    },
 }
 
 sectors_to_save = [

+ 0 - 32
UNFCCC_GHG_data/UNFCCC_reader/Taiwan/config_TWN_NIR2022.py

@@ -55,38 +55,6 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         data = data.reset_index(drop=True)
     return data
 
-def make_wide_table(data: pd.DataFrame, keyword: str, col: Union[int, str], index_cols: List[Union[int, str]])->pd.DataFrame:
-    index = data.loc[data[col] == keyword].index
-    if not list(index):
-        print("Keyword for table transformation not found")
-        return data
-    elif len(index)==1:
-        print("Keyword for table transformation found only once")
-        return data
-    else:
-        df_all = None
-        for i, item in enumerate(index):
-            loc = data.index.get_loc(item)
-            if i < len(index) - 1:
-                next_loc = data.index.get_loc(index[i + 1])
-            else:
-                next_loc = data.index[-1] + 1
-            df_to_add = data.loc[list(range(loc, next_loc))]
-            # select only cols which don't have NaN, Null, or '' as header
-            filter_nan = ((~df_to_add.iloc[0].isnull()) & (df_to_add.iloc[0] != 'NaN')& (df_to_add.iloc[0] != ''))
-            df_to_add = df_to_add.loc[: , filter_nan]
-            df_to_add.columns = df_to_add.iloc[0]
-            #print(df_to_add.columns)
-            df_to_add = df_to_add.drop(loc)
-            df_to_add = df_to_add.set_index(index_cols)
-            
-            if df_all is None:
-                df_all = df_to_add
-            else:
-                df_all = pd.concat([df_all, df_to_add], axis=1, join='outer')
-        return df_all
-        
-
 # page defs tp hold information on reading the table
 page_defs = {
     '5': { 

+ 447 - 0
UNFCCC_GHG_data/UNFCCC_reader/Taiwan/config_TWN_NIR2023.py

@@ -0,0 +1,447 @@
+# config and functions for Taiwan NIR 2023
+
+from typing import Union, List
+import pandas as pd
+import xarray as xr
+from typing import Optional, Any
+
+# GWP specification: interpolated into the entity names of the table
+# definitions below and used as 'source_GWP' in basket_copy
+gwp_to_use = "AR4GWP100"
+# category terminology handed to process_data_for_country as
+# cat_terminology_out by the reader script
+terminology_proc = 'IPCC2006_PRIMAP'
+
+##### Table definitions
+# page defs to hold information on reading the table
+# Keys are PDF page numbers (as strings). The reader script passes the key as
+# `pages=` and unpacks the value dict as keyword arguments to camelot.read_pdf:
+#   table_areas: one "x1,y1,x2,y2" string per table on the page; per the
+#                camelot documentation (x1, y1) is the top-left and (x2, y2)
+#                the bottom-right corner in PDF coordinate space
+#   columns:     optional x coordinates of the column separators
+#   split_text:  let camelot split text that spans several cells
+#   flavor:      camelot parsing method ("stream" for all pages here)
+# The reader appends the tables page by page and area by area, so the order of
+# the entries here determines the indices used in table_defs[...]["tables"].
+page_defs = {
+    '5': { 
+        "table_areas": ['36,523,563,68'],
+        "split_text": False,
+        "flavor": "stream",
+    },
+    '6': {
+        "table_areas": ['34,562,563,53'],
+        #"columns": ['195,228,263,295,328,363,395,428,462,495,529'], # works without
+        "split_text": True,
+        "flavor": "stream",
+    },
+    '7': {
+        "table_areas": ['36,743,531,482', '36,425,564,54'],
+        "split_text": True,
+        "flavor": "stream",
+    },
+    '8': {
+        "table_areas": ['35,748,534,567'],
+        "split_text": True,
+        "flavor": "stream",
+    },
+    '9': {
+        "table_areas": ['34,753,565,286', '34,235,565,63'],
+        "split_text": False,
+        "flavor": "stream",
+    },
+    '10': {
+        "table_areas": ['34,753,565,449'],
+        "split_text": False,
+        "flavor": "stream",
+    },
+    '11': {
+        "table_areas": ['32,522,566,208'],
+        "split_text": True,
+        "flavor": "stream",
+    },
+    '12': {
+        "table_areas": ['33,549,562,64'],
+        "split_text": True,
+        "flavor": "stream",
+    },
+    '13': {
+        "table_areas": ['31,761,532,517'],
+        "split_text": True,
+        "flavor": "stream",
+    },
+    '14': {
+        "table_areas": ['32,751,563,70'],
+        "columns": ['217,250,282,313,344,374,406,437,468,501,531'],
+        "split_text": True,
+        "flavor": "stream",
+    },
+    '15': {
+        "table_areas": ['32,345,565,53'],
+        "split_text": True,
+        "flavor": "stream",
+    },
+    '16': {
+        "table_areas": ['32,745,532,597'],
+        "split_text": True,
+        "flavor": "stream",
+    },
+    '18': {
+        "table_areas": ['30,747,564,260'],
+        "columns": ['188,232,263,298,331,362,398,432,464,497,530'],
+        "split_text": True,
+        "flavor": "stream",
+    }, # correct mistakes later
+}
+
+# table defs to hold information on how to process the tables
+# Keys are the table names of the report. The reader script uses the entries
+# of each value dict as follows:
+#   tables:           indices into the list of tables read from the pdf; the
+#                     listed raw tables are concatenated row-wise
+#   fix_cats:         optional {column: {old value: new value}} replacements
+#                     applied before any other processing
+#   rows_to_fix:      {column: {n_rows: [values]}}; every (column, n_rows,
+#                     values) combination is handed to fix_rows
+#   index_cols:       index columns for make_wide_table; the first one holds
+#                     the original category names
+#   wide_keyword,
+#   col_wide_kwd:     keyword and the column it is searched in to find the
+#                     header rows in make_wide_table
+#   entity:           entity assigned to all rows of the table
+#   gas_splitting:    alternative to "entity": maps values of column
+#                     col_wide_kwd to the entity assigned to that row and all
+#                     following rows up to the next match
+#   unit:             unit assigned to all rows of the table
+#   cat_codes_manual: category name -> category code replacements applied
+#                     before the regular expression based code extraction
+#   drop_rows:        optional index labels removed from the wide table
+table_defs = {
+    'ES2.2': { # 1990-2021 Carbon Dioxide Emissions and Sequestration in Taiwan
+        "tables": [1, 2],
+        "rows_to_fix": {
+            0: { 
+                3: ['1.A.4.c Agriculture, Forestry, Fishery, and',
+                    '2.D Non-Energy Products from Fuels and', 
+                    '4. Land Use, Land Use Change and Forestry'],
+            },
+        },
+        "index_cols": ['GHG Emission Source and Sinks'],
+        "wide_keyword": 'GHG Emission Source and Sinks',
+        "col_wide_kwd": 0, 
+        "entity": "CO2",
+        "unit": "kt",
+        "cat_codes_manual": {
+            'Net GHG Emission (including LULUCF)': '0',
+            'Total GHG Emission (excluding LULUCF)': 'M.0.EL',
+        },            
+    },
+    'ES2.3': { # 1990-2021 Methane Emissions in Taiwan
+        "tables": [3, 4],
+        "rows_to_fix": {},
+        "index_cols": ['GHG Emission Sources and Sinks'],
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0, 
+        "entity": f"CH4 ({gwp_to_use})",
+        "unit": "ktCO2eq",
+        "cat_codes_manual": {
+            'Total Methane Emissions': '0',
+        },
+        "drop_rows": [
+            "5.B Garbage Biological Treatment", # has lower significant digits than in table ES3.6
+            "2. Industrial Process and Product Use Sector",  # inconsistent with subsector sum (rounding)
+        ],
+    },
+    'ES2.4': { # 1990-2021 Nitrous Oxide Emissions in Taiwan
+        "tables": [5],
+        "fix_cats": {
+            0: {
+                "Total Nitrous Oxide Emissionsl": "Total Nitrous Oxide Emissions",
+            },
+        },            
+        "rows_to_fix": {},
+        "index_cols": ['GHG Emission Sources and Sinks'],
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0, 
+        "entity": f"N2O ({gwp_to_use})",
+        "unit": "ktCO2eq",
+        "cat_codes_manual": {
+            'Total Nitrous Oxide Emissions': '0',
+        },
+        "drop_rows": [
+            "3.F Field Burning of Agricultural Residues", # has lower significant digits than in table ES3.4
+            "5. Waste Sector", # error in 1996 data
+        ],
+    },
+    'ES2.5': { # 1990-2021 Fluoride-Containing Gas Emissions in Taiwan
+        "tables": [6,7],
+        "fix_cats": {},
+        "rows_to_fix": {
+            0: {
+                -2: ['Total PFCs Emissions (2.E Electronics Industry)',
+                    'Total SF6 Emissions',
+                    'Total NF3 Emissions (2.E Electronics Industry)'],
+            },
+        },
+        "index_cols": ['GHG Emission Sources and Sinks'],
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0,
+        "gas_splitting": {
+            "Total HFCs Emissions": f"HFCS ({gwp_to_use})",
+            "Total PFCs Emissions (2.E Electronics Industry)": f"PFCS ({gwp_to_use})",
+            "Total SF6 Emissions": f"SF6 ({gwp_to_use})",
+            "Total NF3 Emissions (2.E Electronics Industry)": f"NF3 ({gwp_to_use})",
+            "Total Fluoride-Containing Gas Emissions": f"FGASES ({gwp_to_use})",
+            "GHG Emission Sources and Sinks": "entity",
+        },
+        "unit": "ktCO2eq",
+        "cat_codes_manual": {
+            "Total HFCs Emissions": "2",
+            "Total PFCs Emissions (2.E Electronics Industry)": "2.E",
+            "Total SF6 Emissions": "2",
+            "Total NF3 Emissions (2.E Electronics Industry)": "2.E",
+            "Total Fluoride-Containing Gas Emissions": "2",
+        },
+    },
+    'ES3.1': { # 1990-2021 Greenhouse Gas Emission in Taiwan by Sector
+        "tables": [8],
+        "rows_to_fix": {},
+        "index_cols": ['GHG Emission Sources and Sinks'],
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0, 
+        "entity": f"KYOTOGHG ({gwp_to_use})",
+        "unit": "ktCO2eq",
+        "cat_codes_manual": {
+            'Net GHG Emission (including LULUCF)': '0',
+            'Total GHG Emission (excluding LULUCF)': 'M.0.EL',
+        },
+    },
+    'ES3.2': { # 1990-2021 Greenhouse Gas Emissions Produced by Energy Sector in Taiwan
+        "tables": [9,10],
+        "rows_to_fix": {},
+        "index_cols": ['GHG Emission Sources and Sinks'],
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0, 
+        "gas_splitting": {
+            "Total CO2 Emission": "CO2",
+            "Total CH4 Emission": f"CH4 ({gwp_to_use})",
+            "Total N2O Emission": f"N2O ({gwp_to_use})",
+            "Total Emission from Energy Sector": f"KYOTOGHG ({gwp_to_use})",
+            "GHG Emission Sources and Sinks": "entity",
+        },
+        "unit": "ktCO2eq",
+        "cat_codes_manual": {
+            'Total CO2 Emission': '1',
+            'Total CH4 Emission': '1',
+            'Total N2O Emission': '1',
+            'Total Emission from Energy Sector': '1',
+        },
+    },
+    'ES3.3': { # 1990-2021 Greenhouse Gas Emissions Produced by Industrial Process and Product Use Sector (IPPU) in Taiwan
+        "tables": [11],
+        "rows_to_fix": {},
+        "index_cols": ['GHG Emission Sources and Sinks'],
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0, 
+        "gas_splitting": {
+            "Total CO2 Emission": "CO2",
+            "Total CH4 Emission": f"CH4 ({gwp_to_use})",
+            "Total N2O Emission": f"N2O ({gwp_to_use})",
+            "Total HFCs Emission": f"HFCS ({gwp_to_use})",
+            "Total PFCs Emission (2.E Electronics Industry)": f"PFCS ({gwp_to_use})",
+            "Total SF6 Emission": f"SF6 ({gwp_to_use})",
+            "Total NF3 Emission (2.E Electronics Industry)": f"NF3 ({gwp_to_use})",
+            "Total Emission from IPPU Sector": f"KYOTOGHG ({gwp_to_use})",
+            "GHG Emission Sources and Sinks": "entity",
+        },
+        "unit": "ktCO2eq",
+        "cat_codes_manual": {
+            'Total CO2 Emission': '2',
+            'Total CH4 Emission': '2',
+            'Total N2O Emission': '2',
+            'Total HFCs Emission': '2',
+            'Total PFCs Emission (2.E Electronics Industry)': '2.E',
+            'Total SF6 Emission': '2',
+            'Total NF3 Emission (2.E Electronics Industry)': '2.E',
+            'Total Emission from IPPU Sector': '2',
+        },
+        "drop_rows": [
+        #     ("2.D Non-Energy Products from Fuels and Solvent Use", "CO2"), # has lower significant digits than in table ES2.2
+            "Total CH4 Emission",  # inconsistent with subsectors (rounding)
+        ]
+    }, 
+    'ES3.4': { # 1990-2021 Greenhouse Gas Emissions Produced by Agriculture Sector in Taiwan
+        "tables": [12,13],
+        "rows_to_fix": {},
+        "index_cols": ['GHG Emission Sources and Sinks'],
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0, 
+        "gas_splitting": {
+            "Total CO2 Emission (3.H Urea applied)": "CO2",
+            "Total CH4 Emission": f"CH4 ({gwp_to_use})",
+            "Total N2O Emission": f"N2O ({gwp_to_use})",
+            "Total Emission From Agriculture Sector": f"KYOTOGHG ({gwp_to_use})",
+            "GHG Emission Sources and Sinks": "entity",
+        },
+        "unit": "ktCO2eq",
+        "cat_codes_manual": {
+            'Total CO2 Emission (3.H Urea applied)': '3.H',
+            'Total CH4 Emission': '3',
+            'Total N2O Emission': '3',
+            'Total Emission From Agriculture Sector': '3',
+        },
+    }, 
+    'ES3.6': { # 1990-2020 Greenhouse Gas Emissions in Taiwan by Waste Sector
+        "tables": [14],
+        "rows_to_fix": {
+            0: {
+                3: ["Total CO2 Emission"],
+            },
+        }, 
+        "index_cols": ['GHG Emission Sources and Sinks'], 
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0, # two column header
+        "gas_splitting": {
+            "Total CO2 Emission (5.C Incineration and Open Burning of Waste)": "CO2",
+            "Total CH4 Emission": f"CH4 ({gwp_to_use})",
+            "Total N2O Emission": f"N2O ({gwp_to_use})",
+            "Total Emission from Waste Sector": f"KYOTOGHG ({gwp_to_use})",
+            "GHG Emission Sources and Sinks": "entity",
+        },
+        "unit": "ktCO2eq",
+        "cat_codes_manual": {
+            'Total CO2 Emission (5.C Incineration and Open Burning of Waste)': '5.C',
+            'Total CH4 Emission': '5',
+            'Total N2O Emission': '5',
+            'Total Emission from Waste Sector': '5',
+        },
+    }, 
+}
+
+# Table definitions which are currently not processed: the reader script
+# only imports table_defs from this module. Same layout as table_defs.
+# NOTE(review): the year ranges and table indices in here look like they were
+# carried over from the previous inventory - verify before using them.
+table_defs_skip = {
+    'ES2.1': { # 1990-2020 Greenhouse Gas Emissions and Sequestration in Taiwan by Type
+        "tables": [0],
+        "rows_to_fix": {
+            0: { 
+                3: ['CO2'],
+            },
+            1: {  # where col 0 is empty
+                3: ['Net GHG Emission', 'Total GHG Emission'],
+            },
+        },
+        "index_cols": ['GHG', 'GWP'],
+        "wide_keyword": 'GHG',
+        "col_wide_kwd": 0, 
+        "unit": "ktCO2eq",
+    },
+    'ES2.5': { # 1990-2020 Fluoride-Containing Gas Emissions in Taiwan
+        "tables": [6],
+        "rows_to_fix": {
+            0: {
+                -2: ['Total SF6 Emissions', 
+                     'Total NF3 Emissions'],
+            },
+        },
+        "index_cols": ['GHG Emission Sources and Sinks'],
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0, 
+        #"entity": "CO2",
+        "unit": "ktCO2eq",
+    },
+    'ES3.5': { # skip for now: 1990-2020 Changes in Carbon Sequestration by LULUCF Sector in Taiwan
+        "tables": [12],
+        "rows_to_fix": {}, 
+        "index_cols": ['GHG Emission Sources and Sinks'], # header is a merged column
+        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "col_wide_kwd": 0, # two column header
+        "unit": "kt",
+        "entity": "CO2",
+    }, # need to consider the two columns specially (merge?)
+}
+
+
+##### primap2 metadata
+# Regular expression to extract the category code from a category name: the
+# named group captures the first 1 to 7 characters (letters, digits, dots) if
+# they are followed by whitespace. The reader script replaces the whole name
+# by the content of that group.
+cat_code_regexp = r'(?P<UNFCCC_GHG_data>^[a-zA-Z0-9\.]{1,7})\s.*'
+
+# format of the time column headers (four digit year)
+# NOTE(review): not imported by the reader script, which sets its own
+# identical value
+time_format = "%Y"
+
+# The coords_* dicts below (except coords_value_filling) are passed to
+# pm2.pm2io.convert_wide_dataframe_if by the reader script.
+# coordinate name -> column of the wide table holding that coordinate
+coords_cols = {
+    "category": "category",
+    "entity": "entity",
+    "unit": "unit",
+    # "area": "Geo_code",
+}
+
+# additional coordinates (none used for this inventory)
+add_coords_cols = {
+    #    "orig_cat_name": ["orig_cat_name", "category"],
+}
+
+# terminologies for the coordinates; the category terminology is also used
+# as part of the output file name for the data in original categories
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC2006_1996_Taiwan_Inv",
+    "scenario": "PRIMAP",
+}
+
+# values for coordinates which are not read from a column of the table
+coords_defaults = {
+    "source": "TWN-GHG-Inventory",
+    "provenance": "measured",
+    "scenario": "2023NIR",
+    "area": "TWN",
+    # unit fill by table
+}
+
+# value mappings for the coordinates
+# presumably "PRIMAP1" selects a mapping built into primap2 - confirm
+# against the primap2 documentation
+coords_value_mapping = {
+    "unit": "PRIMAP1",
+    "category": "PRIMAP1",
+}
+
+# The following three dicts are empty and not imported by the reader script
+# (the corresponding arguments are commented out there).
+coords_value_filling = {}
+
+#
+filter_remove = {}
+
+filter_keep = {}
+
+# passed as meta_data to convert_wide_dataframe_if by the reader script
+meta_data = {
+    "references": "https://www.cca.gov.tw/information-service/publications/national-ghg-inventory-report/1851.html",
+    "rights": "",
+    "contact": "mail@johannes-guetschow.de",
+    "title": "2023 Republic of China - National Greenhouse Gas Report",
+    "comment": "Read fom pdf file and converted to PRIMAP2 format by Johannes Gütschow",
+    "institution": "Republic of China - Environmental Protection Administration",
+}
+
+##### processing information
+# Category conversion information, passed as category_conversion to
+# process_data_for_country by the reader script.
+#   mapping:   category code as read from the report -> category code in
+#              the output terminology (terminology_proc)
+#   aggregate: output category -> categories to build it from ('sources')
+#              and an optional name; presumably the sources are summed -
+#              confirm in helper.functions
+# NOTE(review): the aggregate '3.C.5' lists the sources '3.C.5.a' and
+# '3.C.5.b' which are not produced by any mapping above, and '3.C.5' is not
+# among the sources of '3.C' - check whether this entry is needed.
+cat_conversion = {
+    'mapping': {
+        '0': '0',
+        'M.0.EL': 'M.0.EL',
+        '1': '1',
+        '1.A.1': '1.A.1',
+        '1.A.2': '1.A.2',
+        '1.A.3': '1.A.3',
+        '1.A.4': '1.A.4',
+        '1.A.4.a': '1.A.4.a',
+        '1.A.4.b': '1.A.4.b',
+        '1.A.4.c': '1.A.4.c',
+        '1.B.1': '1.B.1',
+        '1.B.2': '1.B.2',
+        '2': '2',
+        '2.A': '2.A',
+        '2.B': '2.B',
+        '2.C': '2.C',
+        '2.D': '2.D',
+        '2.E': '2.E',
+        '2.F': '2.F',
+        '2.G': '2.G',
+        '2.H': '2.H',
+        '3': 'M.AG',
+        '3.A': '3.A.1',
+        '3.B': '3.A.2',
+        '3.C': '3.C.7',
+        '3.D': 'M.3.AS',
+        '3.F': '3.C.1.b',
+        '3.H': '3.C.3',
+        '4': 'M.LULUCF',
+        '5': '4',
+        '5.A': '4.A',
+        '5.B': '4.B',
+        '5.C': '4.C',
+        '5.D': '4.D',
+        '5.D.1': '4.D.1',
+        '5.D.2': '4.D.2',
+    },
+    'aggregate': {
+        '1.A': {'sources': ['1.A.1', '1.A.2', '1.A.3', '1.A.4'],
+                'name': 'Fuel Combustion Activities'},
+        '1.B': {'sources': ['1.B.1', '1.B.2'], 'name': 'Fugitive Emissions from Fuels'},
+        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
+              'name': 'Industrial Process and Product Use Sector'},
+        '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
+        '3.B': {'sources': ['M.LULUCF'], 'name': 'Land'},
+        '3.C.1': {'sources': ['3.C.1.b'], 'name': 'Emissions from Biomass Burning'},
+        '3.C.5': {'sources': ['3.C.5.a', '3.C.5.b'],
+                  'name': 'Indirect N2O Emissions from Managed Soils'},
+        '3.C': {'sources': ['3.C.1', '3.C.3', 'M.3.AS', '3.C.7'],
+                'name': 'Aggregate sources and non-CO2 emissions sources on land'},
+        'M.AG.ELV': {'sources': ['3.C'],
+                     'name': 'Agriculture excluding livestock emissions'},
+        'M.AG': {'sources': ['3.A', '3.C'], 'name': 'Agriculture'},
+        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},  # consistency check
+        'M.0.EL': {'sources': ['1', '2', 'M.AG', '4']}, # consistency check
+        '0': {'sources': ['1', '2', '3', '4']},  # consistency check
+    },
+}
+
+# Handed to process_data_for_country under the key 'basket_copy' by the
+# reader script. Presumably the listed gas baskets are copied from the
+# source GWP to the additional GWP specifications - confirm in
+# helper.functions.
+basket_copy = {
+    'GWPs_to_add': ["SARGWP100", "AR5GWP100", "AR6GWP100"],
+    'entities': ["HFCS", "PFCS"],
+    'source_GWP': gwp_to_use,
+}
+

+ 228 - 0
UNFCCC_GHG_data/UNFCCC_reader/Taiwan/read_TWN_2023-Inventory_from_pdf.py

@@ -0,0 +1,228 @@
+# this script reads data from Taiwan's 2023 national inventory
+# Data is read from the english summary pdf
+# TODO: add further GWPs and gas baskets
+
+import pandas as pd
+import primap2 as pm2
+import camelot
+import copy
+
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper import compression, make_wide_table
+from UNFCCC_GHG_data.helper import process_data_for_country, gas_baskets
+from primap2.pm2io._data_reading import matches_time_format
+
+from config_TWN_NIR2022 import fix_rows
+from config_TWN_NIR2023 import table_defs, page_defs, cat_code_regexp
+from config_TWN_NIR2023 import terminology_proc
+from config_TWN_NIR2023 import gwp_to_use, basket_copy
+from config_TWN_NIR2023 import coords_cols, add_coords_cols, coords_defaults
+from config_TWN_NIR2023 import coords_terminologies, coords_value_mapping
+from config_TWN_NIR2023 import meta_data, cat_conversion
+
+
+# ###
+# configuration
+# ###
+input_folder = downloaded_data_path / 'non-UNFCCC' / 'Taiwan' / '2023_NIR'
+output_folder = extracted_data_path / 'non-UNFCCC' / 'Taiwan'
+# Create the output folder together with any missing parent folder (e.g. a
+# not yet existing 'non-UNFCCC' folder); do nothing if it already exists.
+output_folder.mkdir(parents=True, exist_ok=True)
+
+output_filename = 'TWN_inventory_2023_'
+inventory_file = '2023_NIR_executive_summary_english.pdf'
+
+# ###
+# read the tables from pdf
+# ###
+
+# Raw tables as DataFrames in the order of the pages and table areas given
+# in page_defs; the table definitions refer to positions in this list.
+all_tables = []
+for page, read_options in page_defs.items():
+    print(f"Reading from page {page}")
+    tables_on_page = camelot.read_pdf(
+        str(input_folder / inventory_file),
+        pages=page,
+        **read_options,
+    )
+    all_tables.extend(table.df for table in tables_on_page)
+
+
+# ###
+# convert tables to primap2 format
+# ###
+# Every table definition is turned into a primap2 dataset; the datasets are
+# merged into data_pm2 one after the other.
+data_pm2 = None
+for table_name in table_defs.keys():
+    print(f"Working on table: {table_name}")
+
+    # work on a deep copy because "index_cols" is extended below when the
+    # table is split by gas
+    table_def = copy.deepcopy(table_defs[table_name])
+    # combine all raw tables
+    df_this_table = all_tables[table_def["tables"][0]].copy(deep=True)
+    if len(table_def["tables"]) > 1:
+        for table in table_def["tables"][1:]:
+            df_this_table = pd.concat(
+                [df_this_table, all_tables[table]],
+                axis=0,
+                join='outer')
+
+    # fix for table ES3.6
+    # rows labelled "Total CO Emission" get all further columns blanked and
+    # are relabelled 'Total CO2 Emission', the value listed in "rows_to_fix"
+    # for this table
+    if table_name == 'ES3.6':
+        col_idx = df_this_table[0] == "Total CO Emission"
+        df_this_table.loc[col_idx, 1:] = ''
+        df_this_table.loc[col_idx, 0] = 'Total CO2 Emission'
+
+    # the concatenation leaves duplicate row labels, so renumber the rows
+    df_this_table = df_this_table.reset_index(drop=True)
+
+    # fix categories if necessary
+    if "fix_cats" in table_def.keys():
+        for col in table_def["fix_cats"]:
+            df_this_table[col] = df_this_table[col].replace(table_def["fix_cats"][col])
+
+    # fix rows
+    # note: the clean-up of the category names below only runs for tables
+    # which have entries in "rows_to_fix"
+    for col in table_def["rows_to_fix"].keys():
+        for n_rows in table_def["rows_to_fix"][col].keys():
+            print(f"Fixing {col}, {n_rows}")
+            # replace line breaks, long hyphens, double, and triple spaces in category names
+            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
+            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
+            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
+            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("–", "-")
+            df_this_table = fix_rows(df_this_table,
+                                     table_def["rows_to_fix"][col][n_rows], col, n_rows)
+
+    # split by entity
+    # A row whose value in the keyword column is a key of "gas_splitting"
+    # sets the entity for itself and all following rows up to the next such
+    # row. Rows before the first match get an empty entity.
+    if "gas_splitting" in table_def.keys():
+        col_entity = [''] * len(df_this_table)
+        last_entity = ''
+        for i in range(0, len(df_this_table)):
+            current_header = df_this_table[table_def["col_wide_kwd"]].iloc[i]
+            if current_header in table_def["gas_splitting"].keys():
+                last_entity = table_def["gas_splitting"][current_header]
+            col_entity[i] = last_entity
+
+        df_this_table["entity"] = col_entity
+        # the header rows carry the value "entity" in the new column, so it
+        # can be used as an additional index column of the wide table
+        table_def["index_cols"].append("entity")
+
+    # make a wide table
+    df_this_table = make_wide_table(df_this_table, table_def["wide_keyword"],
+                                    table_def["col_wide_kwd"], table_def["index_cols"])
+
+    # remove the rows listed in the table definition (by index label)
+    if "drop_rows" in table_def.keys():
+        df_this_table = df_this_table.drop(table_def["drop_rows"], axis=0)
+
+    # reset row index
+    # (turns the index columns back into normal columns)
+    df_this_table = df_this_table.reset_index(drop=False)
+
+    # add entity
+    if "entity" in table_def.keys():
+        df_this_table["entity"] = table_def["entity"]
+
+    # add unit
+    df_this_table["unit"] = table_def["unit"]
+
+    # the first index column holds the category names as given in the report
+    df_this_table = df_this_table.rename({table_def["index_cols"][0]: "orig_cat_name"},
+                                         axis=1)
+
+    # print(table_def["index_cols"][0])
+    # print(df_this_table.columns.values)
+
+    # make a copy of the categories row
+    df_this_table["category"] = df_this_table["orig_cat_name"]
+
+    # replace cat names by codes in col "category"
+    # first the manual replacements
+    df_this_table["category"] = df_this_table["category"].replace(
+        table_def["cat_codes_manual"])
+    # then the regex replacements
+    # (names matching cat_code_regexp are replaced by the code captured in
+    # the named group, all other values are left unchanged)
+    repl = lambda m: m.group('UNFCCC_GHG_data')
+    df_this_table["category"] = df_this_table["category"].str.replace(cat_code_regexp,
+                                                                      repl, regex=True)
+
+    ### convert to PRIMAP2 IF
+    # remove ','
+    # (only in the columns whose header matches the time format)
+    time_format = '%Y'
+    time_columns = [
+        col
+        for col in df_this_table.columns.values
+        if matches_time_format(col, time_format)
+    ]
+
+    for col in time_columns:
+        df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(',', '',
+                                                                          regex=False)
+
+    # drop orig_cat_name as it's not unique per category
+    df_this_table = df_this_table.drop(columns="orig_cat_name")
+
+    # coords_defaults_this_table = coords_defaults.copy()
+    # coords_defaults_this_table["unit"] = table_def["unit"]
+    df_this_table_if = pm2.pm2io.convert_wide_dataframe_if(
+        df_this_table,
+        coords_cols=coords_cols,
+        add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data
+    )
+
+    this_table_pm2 = pm2.pm2io.from_interchange_format(df_this_table_if)
+
+    # collect the data of all tables in one dataset
+    if data_pm2 is None:
+        data_pm2 = this_table_pm2
+    else:
+        data_pm2 = data_pm2.pr.merge(this_table_pm2)
+
+# interchange format version of the merged data (units in the fixed format)
+data_if = data_pm2.pr.to_interchange_format()
+
+# ###
+# save data
+# ###
+# data in original categories: interchange format files and a netCDF file
+# share the same base name built from the category terminology
+raw_output_path = output_folder / (
+    output_filename + coords_terminologies["category"]
+)
+pm2.pm2io.write_interchange_format(raw_output_path, data_if)
+
+encoding = dict.fromkeys(data_pm2.data_vars, compression)
+data_pm2.pr.to_netcdf(raw_output_path.with_suffix(".nc"), encoding=encoding)
+
+
+# ###
+# convert to IPCC2006 categories
+# ###
+# country specific processing consists of the basket copy only
+country_processing = dict(basket_copy=basket_copy)
+
+# the processing works on a deep copy of the data in original categories
+data_proc_pm2 = process_data_for_country(
+    data_pm2.copy(deep=True),
+    entities_to_ignore=[],
+    gas_baskets=gas_baskets,
+    processing_info_country=country_processing,
+    cat_terminology_out=terminology_proc,
+    category_conversion=cat_conversion,
+)
+
+# convert to IF
+data_proc_if = data_proc_pm2.pr.to_interchange_format()
+
+# ###
+# save data
+# ###
+# data in 2006 categories: interchange format files and a netCDF file
+proc_output_path = output_folder / (output_filename + "IPCC2006_PRIMAP")
+pm2.pm2io.write_interchange_format(proc_output_path, data_proc_if)
+
+encoding = dict.fromkeys(data_proc_pm2.data_vars, compression)
+data_proc_pm2.pr.to_netcdf(proc_output_path.with_suffix(".nc"), encoding=encoding)

+ 2 - 1
UNFCCC_GHG_data/helper/__init__.py

@@ -8,7 +8,7 @@ from .definitions import GWP_factors, gas_baskets
 from .definitions import compression
 from .functions import get_country_code, get_country_name, convert_categories
 from .functions import create_folder_mapping, process_data_for_country, get_code_file
-from .functions import fix_rows
+from .functions import fix_rows, make_wide_table
 
 __all__ = [
     "root_path",
@@ -31,5 +31,6 @@ __all__ = [
     "create_folder_mapping",
     "process_data_for_country",
     "fix_rows",
+    "make_wide_table",
     "compression",
 ]

+ 39 - 1
UNFCCC_GHG_data/helper/functions.py

@@ -8,7 +8,7 @@ import pandas as pd
 import numpy as np
 from datetime import date
 from copy import deepcopy
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 from pathlib import Path
 from .definitions import custom_country_mapping, custom_folders
 from .definitions import root_path, downloaded_data_path, extracted_data_path
@@ -378,6 +378,7 @@ def convert_categories(
 ) -> xr.Dataset:
     """
     convert data from one category terminology to another
+    # TODO rewrite to use aggregate_coordinates functions
     """
     print(f"converting categories to {terminology_to}")
 
@@ -980,3 +981,40 @@ def fix_rows(
         data.loc[indices_to_merge[0]] = new_row
         data = data.drop(indices_to_merge[1:])
     return data
+
+
+def make_wide_table(
+        data: pd.DataFrame,
+        keyword: str,
+        col: Union[int, str],
+        index_cols: List[Union[int, str]]
+) -> pd.DataFrame:
+    """
+    Combine vertically stacked table blocks into one wide table.
+
+    Every row where column `col` equals `keyword` is treated as a header row
+    starting a new block which extends to the row before the next header row
+    (or to the end of the table). For each block the header row is used as
+    column names, columns with an empty header (null, 'NaN', or '') are
+    removed, and `index_cols` are set as index. The blocks are then joined
+    side by side on their index using an outer join.
+
+    Rows are addressed by position, so the row labels of `data` do not have
+    to be a default integer range.
+
+    Parameters
+    ----------
+    data
+        Table containing the stacked blocks.
+    keyword
+        Value which identifies a header row.
+    col
+        Label of the column in which `keyword` is searched.
+    index_cols
+        Column names (values taken from the header rows) to use as index of
+        the blocks.
+
+    Returns
+    -------
+    The wide table. If the keyword is not found or found only once, a message
+    is printed and `data` is returned unchanged.
+    """
+    header_positions = [
+        pos for pos, is_header in enumerate(data[col] == keyword) if is_header
+    ]
+    if not header_positions:
+        print("Keyword for table transformation not found")
+        return data
+    if len(header_positions) == 1:
+        print("Keyword for table transformation found only once")
+        return data
+
+    # a block ends where the next one starts, the last one at the table end
+    block_ends = header_positions[1:] + [len(data)]
+
+    df_all = None
+    for start, end in zip(header_positions, block_ends):
+        df_to_add = data.iloc[start:end]
+        # select only cols which don't have NaN, Null, or '' as header
+        header = df_to_add.iloc[0]
+        filter_nan = (~header.isnull()) & (header != 'NaN') & (header != '')
+        df_to_add = df_to_add.loc[:, filter_nan]
+        df_to_add.columns = df_to_add.iloc[0]
+        # remove the header row itself and index by the requested columns
+        df_to_add = df_to_add.iloc[1:].set_index(index_cols)
+
+        if df_all is None:
+            df_all = df_to_add
+        else:
+            df_all = pd.concat([df_all, df_to_add], axis=1, join='outer')
+    return df_all