
Adapt UNFCCC-downloader to the new UNFCCC website. Some fixes for the Morocco, Montenegro, and Thailand BUR reading code

Johannes Gütschow, 2 years ago, commit e80dcb0482

+ 34 - 26
code/UNFCCC_downloader/unfccc_submission_info.py

@@ -17,13 +17,16 @@ def get_unfccc_submission_info(
     info = []
     pattern = re.compile(r"BUR ?\d")
     i = 0
+    last_excep = None
     while i < max_tries:
         try:
             driver.get(url)
             html = BeautifulSoup(driver.page_source, "html.parser")
-            title = html.find("h1").contents[0]
+            subtree = html.find(class_="document-title")
+            title = subtree.find("span").contents[0]
             break
-        except (AttributeError, WebDriverException):
+        except (AttributeError, WebDriverException) as excep:
+            last_excep = excep
             print(f"Error fetching {url}")
             print("Retrying ...")
             time.sleep(randrange(5, 15))
@@ -31,7 +34,8 @@ def get_unfccc_submission_info(
             continue
 
     if i == max_tries:
-        print(f"Aborting after {max_tries} tries")
+        print(f"Aborting after {max_tries} tries.")
+        print(last_excep)
     else:
         match = pattern.search(title)
         if match:
@@ -39,31 +43,35 @@ def get_unfccc_submission_info(
         else:
             kind = None
 
-        h2 = html.find("h2", text="Versions")
-        if h2:
-            div = h2.findNext("div")
-            links = div.findAll("a")
-            try:
-                country = (
-                    html.find("h2", text="Countries").findNext("div").findNext("div").text
-                )
-            except AttributeError:
-                country = (
-                    html.find("h2", text="Corporate Author")
-                    .findNext("div")
-                    .findNext("div")
-                    .text
-                )
-            doctype = (
-                html.find("h2", text="Document Type").findNext("div").findNext("div").text
-            )
-            for link in links:
-                url = link.attrs["href"]
+        # TODO: might improve speed by first searching for class="document-line" and then operating on the resulting subtree for the info
+        try:
+            subtree = html.find_all(
+                class_="field field--name-field-document-country field--type-termstore-entity-reference field--label-inline")
+            country = subtree[0].find(class_="field--item").contents[0]
+        except AttributeError:
+            # author as backup for country
+            subtree = html.find_all(class_="field--name-field-document-ca")
+            country = subtree[0].find(class_="field--item").contents[0]
+        # document type
+        subtree = html.find_all(
+            class_="field field--name-field-document-type field--type-termstore-entity-reference field--label-hidden field--items")
+        doctype = subtree[0].find(class_="field--item").contents[0]
+
+        # get files
+        sub_files = html.find(
+            class_=["form-select form-control", "form-select form-control download"])
+        files = sub_files.find_all("option", value=True)
+        files = [file.attrs['value'] for file in files]
+
+        if len(files) > 0:
+            for file in files:
                 if not kind:
-                    match = pattern.search(url.upper())
+                    match = pattern.search(file.upper())
                     if match:
                         kind = match.group(0)
                     else:
+                        # TODO: check why search in filename makes sense (compared to
+                        #  directly using doctype)
                         if ("CRF" in doctype) or ("CRF" in title):
                             kind = "CRF"
                         elif ("SEF" in doctype) or ("SEF" in title):
@@ -80,10 +88,10 @@ def get_unfccc_submission_info(
                     "Kind": kind,
                     "Country": country,
                     "Title": title,
-                    "URL": url,
+                    "URL": file,
                 })
 
-            print("\t".join([kind, country, title, url]))
+                print("\t".join([kind, country, title, file]))
         else:
             print(f"No files found for {url}")
 

+ 1 - 3
code/UNFCCC_reader/Montenegro/config_MNE_BUR3.py

@@ -56,10 +56,8 @@ aggregate_cats = {
     '3.C.1': {'sources': ['3.C.1.c', '3.C.1.b'], 'name': 'Emissions from Biomass Burning'},
     '3.C': {'sources': ['3.C.1', '3.C.3', 'M.3.C.45AG', '3.C.7'],
             'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-    'M.3.C.AG': {'sources': ['3.C.1.b', '3.C.3', 'M.3.C.45AG', '3.C.7'],
+    'M.3.C.AG': {'sources': ['3.C.1', '3.C.3', 'M.3.C.45AG', '3.C.7'],
             'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-    'M.3.C.LU': {'sources': ['3.C.1.c'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Land use)'},
     '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
     'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock emissions'},
 }

+ 5 - 10
code/UNFCCC_reader/Montenegro/read_MNE_BUR3_from_pdf.py

@@ -45,13 +45,6 @@ regex_entity = r"^(.*)\s\("
 gwp_to_use = 'AR4GWP100'
 
 # conversion to PRIMAP2 format
-# manual category codes
-cat_codes_manual = { # transform to PRIMAP1 form. PRIMAP2 form in next step with other codes
-    'International bunkers': 'MBK',
-    'Marine': 'MBKM',
-    'Aviation': 'MBKA',
-    'Multilateral operations': 'MMULTIOP',
-}
 
 coords_terminologies = {
     "area": "ISO3",
@@ -69,9 +62,9 @@ coords_defaults = {
 coords_value_mapping = {
     'unit': 'PRIMAP1',
     'entity': {
-        f"GHG {gwp_to_use}": f"KYOTOGHG {gwp_to_use}",
-        f"HFC {gwp_to_use}": f"HFCS {gwp_to_use}",
-        f"PFC {gwp_to_use}": f"PFCS {gwp_to_use}",
+        f"GHG ({gwp_to_use})": f"KYOTOGHG ({gwp_to_use})",
+        f"HFC ({gwp_to_use})": f"HFCS ({gwp_to_use})",
+        f"PFC ({gwp_to_use})": f"PFCS ({gwp_to_use})",
     },
     'category': {
         'Total national GHG emissions (with LULUCF)': '0',
@@ -80,6 +73,8 @@ coords_value_mapping = {
         '1.A.3.a.i': 'M.BK.A',
         '1.A.3.d.i': 'M.BK.M',
         'CO2 from Biomass Combustion for Energy Production': 'M.BIO',
+        '6 Other': '6',
+        '2 H': '2.H',
     },
 }
 

+ 137 - 0
code/UNFCCC_reader/Morocco/config_MAR_BUR3.py

@@ -0,0 +1,137 @@
+# define which raw tables to combine
+table_defs = {
+    2010: {
+        'Energy': [0, 1],
+        'Agriculture': [10],
+        'IPPU': [15, 16, 17],
+        'LULUCF': [30],
+        'Waste': [35],
+    },
+    2012: {
+        'Energy': [2, 3],
+        'Agriculture': [11],
+        'IPPU': [18, 19, 20],
+        'LULUCF': [31],
+        'Waste': [36],
+    },
+    2014: {
+        'Energy': [4, 5],
+        'Agriculture': [10],
+        'IPPU': [21, 22, 23],
+        'LULUCF': [32],
+        'Waste': [37],
+    },
+    2016: {
+        'Energy': [6, 7],
+        'Agriculture': [10],
+        'IPPU': [24, 25, 26],
+        'LULUCF': [33],
+        'Waste': [38],
+    },
+    2018: {
+        'Energy': [8, 9],
+        'Agriculture': [14],
+        'IPPU': [27, 28, 29],
+        'LULUCF': [34],
+        'Waste': [39],
+    },
+}
+
+header_defs = {
+    'Energy': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
+        ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg']],
+    'Agriculture': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
+        ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg']],
+    'IPPU': [['Catégories', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6', 'NOx', 'CO', 'COVNM', 'SO2'],
+        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
+    'LULUCF': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
+        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
+    'Waste': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
+        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
+}
+
+remove_cats = ['3.A.4', '3.B', '3.B.4', '1.B.2.a', '1.B.2.b', '1.B.2.c']
+
+cat_mapping = {
+    "1.B.2.a.4": "1.B.2.a.iii.4",
+    "1.B.2.a.5": "1.B.2.a.iii.5",
+    "1.B.2.a.6": "1.B.2.a.iii.6",
+    "1.B.2.b.2": "1.B.2.b.iii.2",
+    "1.B.2.b.4": "1.B.2.b.iii.4",
+    "1.B.2.b.5": "1.B.2.b.iii.5",
+    "1.B.2.b.6": "1.B.2.b.iii.6",
+    "1.B.2.c.1": "1.B.2.b.i", # simplification, split to oil and gas ("1.B.2.X.i")
+    "1.B.2.c.2": "1.B.2.b.ii", # simplification, split to oil and gas ("1.B.2.X.ii")
+    '1.A.2.g': '1.A.2.m', # other industry
+    '3.A': '3.A.1', # enteric fermentation
+    '3.A.1': '3.A.1.a', # cattle
+    '3.A.1.a': '3.A.1.a.i',
+    '3.A.1.b': '3.A.1.a.ii',
+    '3.A.2': '3.A.1.c',
+    '3.A.3': '3.A.1.h', # Swine
+    '3.A.4.a': '3.A.1.d', # goats
+    '3.A.4.b': '3.A.1.e', # camels
+    '3.A.4.c': '3.A.1.f', # horses
+    '3.A.4.d': '3.A.1.g', # Mules and asses
+    '3.A.4.e': '3.A.1.i', # poultry
+#    '3.B': '3.A.2', # Manure Management
+    '3.B.1': '3.A.2.a', # cattle
+    '3.B.1.a': '3.A.2.a.i',
+    '3.B.1.b': '3.A.2.a.ii',
+    '3.B.2': '3.A.2.c', # Sheep
+    '3.B.3': '3.A.2.h', # Swine
+    '3.B.4.a': '3.A.2.d', # Goats
+    '3.B.4.b': '3.A.2.e', # Camels
+    '3.B.4.c': '3.A.2.f', # Horses
+    '3.B.4.d': '3.A.2.g', # Mules and Asses
+    '3.B.4.e': '3.A.2.i', # Poultry
+    '3.B.5': '3.C.6', # indirect N2O from manure management
+    '3.C': '3.C.7', # rice
+    '3.D': 'M.3.C.45AG', # Agricultural soils
+    '3.D.a': '3.C.4', # direct N2O from agri soils
+    '3.D.a.1': '3.C.4.a', # inorganic fertilizers
+    '3.D.a.2': '3.C.4.b', # organic fertilizers
+    '3.D.a.3': '3.C.4.c', # urine and dung by grazing animals
+    '3.D.a.4': '3.C.4.d', # N in crop residues
+    '3.D.b': '3.C.5', # indirect N2O from managed soils
+    '3.D.b.1': '3.C.5.a', # Atmospheric deposition
+    '3.D.b.2': '3.C.5.b', # nitrogen leaching and runoff
+    '3.H': '3.C.3', # urea application
+    'LU.3.B.1': '3.B.1', # forest
+    'LU.3.B.2': '3.B.2', # cropland
+    'LU.3.B.3': '3.B.3', # grassland
+    'LU.3.B.4': '3.B.4', # wetland
+    'LU.3.B.5': '3.B.5', # Settlements
+    'LU.3.B.6': '3.B.6', # other land
+}
+
+aggregate_cats = {
+    '1.B.2.a.iii': {'sources': ['1.B.2.a.iii.4', '1.B.2.a.iii.5', '1.B.2.a.iii.6'],
+                    'name': 'All Other'},
+    '1.B.2.b.iii': {'sources': ['1.B.2.b.iii.2', '1.B.2.b.iii.4', '1.B.2.b.iii.5',
+                                '1.B.2.b.iii.6',],
+                    'name': 'All Other'},
+    '1.B.2.a': {'sources': ['1.B.2.a.iii'], 'name': 'Oil'},
+    '1.B.2.b': {'sources': ['1.B.2.b.i', '1.B.2.b.ii', '1.B.2.b.iii'],
+                'name': 'Natural Gas'},
+    '2.D':  {'sources': ['2.D.4'], 'name': 'Non-Energy Products from Fuels and Solvent Use'},
+    '2.F.1':  {'sources': ['2.F.1.a', '2.F.1.b'], 'name': 'Refrigeration and Air Conditioning'},
+    '2.F':  {'sources': ["2.F.1", "2.F.2", "2.F.3", "2.F.4", "2.F.5", "2.F.6"],
+             'name': 'Product uses as Substitutes for Ozone Depleting Substances'},
+    '2.H':  {'sources': ["2.H.1", "2.H.2", "2.H.3"], 'name': 'Other'},
+    '3.A.2': {'sources': ['3.A.2.a', '3.A.2.c', '3.A.2.d', '3.A.2.e', '3.A.2.f',
+                          '3.A.2.g', '3.A.2.h', '3.A.2.i'],
+              'name': 'Manure Management'},
+    '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
+    '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.3', '3.B.4', '3.B.5', '3.B.6'], 'name': 'Land'},
+    '3.C': {'sources': ['3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
+            'name': 'Aggregate sources and non-CO2 emissions sources on land'},
+    'M.3.C.AG': {'sources': ['3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
+            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
+    'M.AG': {'sources': ['3.A', 'M.3.C.AG'], 'name': 'Agriculture'},
+    '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock emissions'},
+}
+
+zero_cats = ['1.B.2.a.i', '1.B.2.a.ii']  # venting and flaring are zero for oil
+# as all emissions are mapped to natural gas
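
The "LU." prefix on the land-use codes exists because "3.B" is used twice in the source tables: manure management in the agriculture tables and land categories in the LULUCF table. The reader below therefore tags the LULUCF rows as "LU.3.B.x" in cat_codes_manual first, so a plain replace with cat_mapping can move both meanings without collisions. A minimal pandas sketch of that idea:

import pandas as pd

cats = pd.Series(["3.B.1", "LU.3.B.1"])
# same two entries as in cat_mapping above
print(cats.replace({"3.B.1": "3.A.2.a", "LU.3.B.1": "3.B.1"}))
# 0    3.A.2.a   (manure management, cattle)
# 1      3.B.1   (land, forest)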

+ 122 - 71
code/UNFCCC_reader/Morocco/read_MAR_BUR3_from_pdf.py

@@ -4,8 +4,11 @@
 import camelot
 import primap2 as pm2
 import pandas as pd
-import numpy as np
+import copy
 from pathlib import Path
+from config_MAR_BUR3 import zero_cats, cat_mapping, aggregate_cats, remove_cats, \
+    table_defs, header_defs
+from primap2.pm2io._data_reading import matches_time_format, filter_data
 
 # ###
 # configuration
@@ -15,12 +18,8 @@ root_path = root_path.resolve()
 downloaded_data_path = root_path / "downloaded_data"
 extracted_data_path = root_path / "extracted_data"
 
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Morocco' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Morocco'
-if not output_folder.exists():
-    output_folder.mkdir()
-
 output_filename = 'MAR_BUR3_2022_'
 
 inventory_file = 'Morocco_BUR3_Fr.pdf'
@@ -33,79 +32,29 @@ pages_to_read = range(104, 138)
 
 compression = dict(zlib=True, complevel=9)
 
-header_defs = {
-    'Energy': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'Agriculture': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'IPPU': [['Catégories', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'LULUCF': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'Waste': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
-}
-
-# define which raw tables to combine
-table_defs = {
-    2010: {
-        'Energy': [0, 1],
-        'Agriculture': [10],
-        'IPPU': [15, 16, 17],
-        'LULUCF': [30],
-        'Waste': [35],
-    },
-    2012: {
-        'Energy': [2, 3],
-        'Agriculture': [11],
-        'IPPU': [18, 19, 20],
-        'LULUCF': [31],
-        'Waste': [36],
-    },
-    2014: {
-        'Energy': [4, 5],
-        'Agriculture': [10],
-        'IPPU': [21, 22, 23],
-        'LULUCF': [32],
-        'Waste': [37],
-    },
-    2016: {
-        'Energy': [6, 7],
-        'Agriculture': [10],
-        'IPPU': [24, 25, 26],
-        'LULUCF': [33],
-        'Waste': [38],
-    },
-    2018: {
-        'Energy': [8, 9],
-        'Agriculture': [14],
-        'IPPU': [27, 28, 29],
-        'LULUCF': [34],
-        'Waste': [39],
-    },
-}
-
 # special header as category code and name in one column
 header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
 
 index_cols = ['Catégories']
 
 # rows to remove
-cats_remove = []
+cats_remove = [
+    'Agriculture' # always empty
+]
 
 # manual category codes
 cat_codes_manual = {
     '1.A.2.e -Industries agro-alimentaires et du tabac': '1.A.2.e',
     '1.A.2.f -Industries des minéraux non- métalliques': '1.A.2.f',
-    'Agriculture': 'M.AG',
+    #'Agriculture': 'M.AG',
     '2. PIUP': '2',
     'UTCATF': 'M.LULUCF',
-    '3.B.1 Terres forestières': '3.B.1',
-    '3.B.2 Terres cultivées': '3.B.2',
-    '3.B.3 Prairies': '3.B.3',
-    '3.B.4 Terres humides': '3.B.4',
-    '3.B.5 Etablissements': '3.B.5',
-    '3.B.6 Autres terres': '3.B.6',
+    '3.B.1 Terres forestières': 'LU.3.B.1',
+    '3.B.2 Terres cultivées': 'LU.3.B.2',
+    '3.B.3 Prairies': 'LU.3.B.3',
+    '3.B.4 Terres humides': 'LU.3.B.4',
+    '3.B.5 Etablissements': 'LU.3.B.5',
+    '3.B.6 Autres terres': 'LU.3.B.6',
     '1.B.1.a.i.1 -Exploitation minière': '1.A.1.a.i.1',
 }
 
@@ -113,7 +62,7 @@ cat_code_regexp = r'(?P<code>^[a-zA-Z0-9\.]{1,14})\s-\s.*'
 
 coords_terminologies = {
     "area": "ISO3",
-    "category": "IPCC2006_PRIMAP",
+    "category": "IPCC1996_2006_MAR_Inv",
     "scenario": "PRIMAP",
 }
 
@@ -140,9 +89,9 @@ coords_cols = {
     "unit": "unit"
 }
 
-add_coords_cols = {
-    "orig_cat_name": ["orig_cat_name", "category"],
-}
+#add_coords_cols = {
+#    "orig_cat_name": ["orig_cat_name", "category"],
+#}
 
 filter_remove = {
     "f1": {
@@ -184,6 +133,13 @@ for year in table_defs.keys():
         df_this_table = df_this_table.drop(df_this_table.iloc[0:2].index)
         df_this_table.columns = header_defs[sector]
 
+        # fix 2018 agri table
+        if (year == 2018) & (sector == "Agriculture"):
+            last_shift_row = 25
+            df_temp = df_this_table.iloc[0: last_shift_row, 1:].copy()
+            df_this_table.iloc[0, 1:] = ''
+            df_this_table.iloc[1: last_shift_row + 1, 1:] = df_temp
+
         # replace line breaks, long hyphens, double, and triple spaces in category names
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
@@ -221,7 +177,7 @@ df_all = df_all.reset_index(drop=True)
 
 # prepare numbers for pd.to_numeric
 df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(' ', '')
-repl = lambda m: m.group('part1') + m.group('part2')
+repl = lambda m: m.group('part1') + '.' + m.group('part2')
 df_all.loc[:, 'data'] = df_all.loc[:, 'data'].str.replace(
     '(?P<part1>[0-9]+),(?P<part2>[0-9\.]+)$', repl, regex=True)
 df_all['data'][df_all['data'].isnull()] = 'NaN'
@@ -231,6 +187,9 @@ for entity in df_all["entity"].unique():
     df_all["entity"][(df_all["entity"] == entity) & (
                 df_all["unit"] == "GgCO2eq")] = f"{entity} ({gwp_to_use})"
 
+# drop "original_cat_name" as it has non-unique values per category
+df_all = df_all.drop(columns="orig_cat_name")
+
 data_if = pm2.pm2io.convert_long_dataframe_if(
     df_all,
     coords_cols=coords_cols,
@@ -265,9 +224,94 @@ data_pm2 = data_pm2.drop_vars(entities_to_convert)
 # convert back to IF to have units in the fixed format
 data_if = data_pm2.pr.to_interchange_format()
 
-##### save data to IF and native format ####
+# ###
+# convert to IPCC2006 categories
+# ###
+data_if_2006 = copy.deepcopy(data_if)
+data_if_2006.attrs = copy.deepcopy(data_if.attrs)
+
+filter_remove_cats = {
+    "cat": {
+        f"category ({coords_terminologies['category']})":
+    remove_cats
+    },
+}
+
+filter_data(data_if_2006, filter_remove=filter_remove_cats)
+
+# map categories
+data_if_2006 = data_if_2006.replace(
+    {f"category ({coords_terminologies['category']})": cat_mapping})
+data_if_2006[f"category ({coords_terminologies['category']})"].unique()
+
+# rename the category col
+data_if_2006.rename(columns={
+    f"category ({coords_terminologies['category']})": 'category (IPCC2006_PRIMAP)'},
+                    inplace=True)
+data_if_2006.attrs['attrs']['cat'] = 'category (IPCC2006_PRIMAP)'
+data_if_2006.attrs['dimensions']['*'] = [
+    'category (IPCC2006_PRIMAP)' if item == f"category ({coords_terminologies['category']})"
+    else item for item in data_if_2006.attrs['dimensions']['*']]
+# aggregate categories
+time_format = '%Y'
+time_columns = [
+    col
+    for col in data_if_2006.columns.values
+    if matches_time_format(col, time_format)
+]
+
+for cat_to_agg in aggregate_cats:
+    mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
+        aggregate_cats[cat_to_agg]["sources"])
+    df_test = data_if_2006[mask]
+    # print(df_test)
+
+    if len(df_test) > 0:
+        print(f"Aggregating category {cat_to_agg}")
+        df_combine = df_test.copy(deep=True)
+
+        for col in time_columns:
+            df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
+
+        df_combine = df_combine.groupby(
+            by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
+                'unit']).sum(min_count=1)
+
+        df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
+        # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
+        # df_combine.insert(2, "orig_cat_name", "computed")
+
+        df_combine = df_combine.reset_index()
+
+        data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join='outer')
+        data_if_2006 = data_if_2006.reset_index(drop=True)
+    else:
+        print(f"no data to aggregate category {cat_to_agg}")
+
+for cat in zero_cats:
+    entities = data_if_2006["entity"].unique()
+    data_zero = data_if_2006[data_if_2006["category (IPCC2006_PRIMAP)"]=="1"].copy(
+        deep=True)
+    data_zero["category (IPCC2006_PRIMAP)"] = cat
+    for col in time_columns:
+        data_zero[col] = 0
+
+    data_if_2006 = pd.concat([data_if_2006, data_zero])
+
+# conversion to PRIMAP2 native format
+data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
+
+# convert back to IF to have units in the fixed format
+data_if_2006 = data_pm2_2006.pr.to_interchange_format()
+
+
+# ###
+# save data to IF and native format
+# ###
 if not output_folder.exists():
     output_folder.mkdir()
+
+# data in original categories
 pm2.pm2io.write_interchange_format(
     output_folder / (output_filename + coords_terminologies["category"]), data_if)
 
@@ -276,3 +320,10 @@ data_pm2.pr.to_netcdf(
     output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
     encoding=encoding)
 
+# data in 2006 categories
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006)
+
+encoding = {var: compression for var in data_pm2_2006.data_vars}
+data_pm2_2006.pr.to_netcdf(
+    output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding)
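
The category aggregation added above boils down to a groupby-sum per target category. A toy illustration with made-up values, grouping only by three columns for brevity (the script additionally groups by source, scenario (PRIMAP) and provenance):

import pandas as pd

df = pd.DataFrame({
    "area (ISO3)": ["MAR", "MAR"],
    "entity": ["CH4", "CH4"],
    "unit": ["Gg", "Gg"],
    "category (IPCC2006_PRIMAP)": ["3.A.1", "3.A.2"],
    "2018": [10.0, 5.0],
})
# sources of '3.A' (Livestock) as defined in aggregate_cats
mask = df["category (IPCC2006_PRIMAP)"].isin(["3.A.1", "3.A.2"])
agg = df[mask].groupby(["area (ISO3)", "entity", "unit"])[["2018"]].sum(min_count=1)
agg.insert(0, "category (IPCC2006_PRIMAP)", "3.A")
df = pd.concat([df, agg.reset_index()], ignore_index=True)
print(df)  # three rows; the new '3.A' row carries 15.0 for 2018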

+ 4 - 0
code/UNFCCC_reader/Thailand/read_THA_BUR3_from_pdf.py

@@ -304,6 +304,10 @@ data_indirect_pm2 = pm2.pm2io.from_interchange_format(data_indirect_IF)
 
 data_all = data_inventory_pm2.pr.merge(data_main_sector_ts_pm2)
 data_all = data_all.pr.merge(data_indirect_pm2)
+
+# combine CO2 emissions and absorptions
+data_all["CO2"] = data_all['CO2 removals'] + data_all['CO2 emissions']
+
 data_all_if = data_all.pr.to_interchange_format()
 
 

+ 1 - 1
downloaded_data/UNFCCC/submissions-annexI_2022.csv

@@ -1 +1 @@
-../../.git/annex/objects/wm/95/MD5E-s1--68b329da9893e34099c7d8ad5cb9c940.csv/MD5E-s1--68b329da9893e34099c7d8ad5cb9c940.csv
+../../.git/annex/objects/Vm/X3/MD5E-s28534--a0b2bc09b840b25b6c24806403087be8.csv/MD5E-s28534--a0b2bc09b840b25b6c24806403087be8.csv