Browse Source

add code for Mauritania BUR2, fix problems in DI configuration

Johannes Gütschow 9 months ago
parent
commit
48a3ba0c7a

+ 1 - 0
pyproject.toml

@@ -130,6 +130,7 @@ docstring-code-format = true
 ]
 "src/unfccc_ghg_data/unfccc_reader/*/config_*.py" = [
     "E501",  # don't enforce line length
+    "RUF001",  # don't check for ambiguous unicode characters in strings
 ]
 "src/unfccc_ghg_data/unfccc_crf_reader/crf_specifications/*_specification.py" = [
     "E501",  # don't enforce line length

+ 4 - 5
src/unfccc_ghg_data/helper/functions.py

@@ -173,9 +173,7 @@ def process_data_for_country(  # noqa PLR0913, PLR0912, PLR0915
                 remove_info = copy.deepcopy(processing_info_country["remove_ts"][case])
                 entities = remove_info.pop("entities")
                 for entity in entities:
-                    data_country[entity].pr.loc[remove_info] = (
-                        data_country[entity].pr.loc[remove_info] * np.nan
-                    )
+                    data_country[entity].pr.loc[remove_info] *= np.nan
 
         # remove all data for given years if necessary
         if "remove_years" in processing_info_country:
@@ -270,7 +268,7 @@ def process_data_for_country(  # noqa PLR0913, PLR0912, PLR0915
             warnings.warn(
                 'The "aggregate_cats" flag is deprecated and will '
                 "be removed in a future version. Please use "
-                '"aggregate_coord" with key "category" instead',
+                '"aggregate_coords" with key "category" instead',
                 category=DeprecationWarning,
             )
             print(
@@ -293,7 +291,7 @@ def process_data_for_country(  # noqa PLR0913, PLR0912, PLR0915
                 min_count=1,
             )
 
-        if "aggregate_coord" in processing_info_country:
+        if "aggregate_coords" in processing_info_country:
             print(
                 f"Aggregating data for country {country_code}, source {source}, "
                 f"scenario {scenario}"
@@ -942,6 +940,7 @@ def fix_rows(
         new_row = new_row.str.replace("N O", "NO")
         new_row = new_row.str.replace(", N", ",N")
         new_row = new_row.str.replace("- ", "-")
+        new_row = new_row.str.strip()
         # replace spaces in numbers
         pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
 

+ 35 - 25
src/unfccc_ghg_data/unfccc_di_reader/unfccc_di_reader_config.py

@@ -1989,18 +1989,18 @@ di_processing_templates = {
     # MKD:
     "MKD": {
         "DI2023-05-24": {  # 1990-2009
-            "downscale": {
-                "entities": {
-                    "FGASES": {
-                        "basket": f"FGASES ({gwp_to_use})",
-                        "basket_contents": [f"HFCS ({gwp_to_use})"],
-                    },
-                    "HFC": {
-                        "basket": f"HFCS ({gwp_to_use})",
-                        "basket_contents": [f"UnspMixOfHFCs ({gwp_to_use})"],
-                    },
-                },
-            },
+            # "downscale": {
+            #     "entities": {
+            #         "FGASES": {
+            #             "basket": f"FGASES ({gwp_to_use})",
+            #             "basket_contents": [f"HFCS ({gwp_to_use})"],
+            #         },
+            #         "HFC": {
+            #             "basket": f"HFCS ({gwp_to_use})",
+            #             "basket_contents": [f"UnspMixOfHFCs ({gwp_to_use})"],
+            #         },
+            #     },
+            # },
             "basket_copy": {
                 "GWPs_to_add": ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
                 "entities": ["UnspMixOfHFCs"],
@@ -2020,16 +2020,16 @@ di_processing_templates = {
                         "sel": {"time": ["1995", "2000"]},
                     },
                 },
-                "entities": {
-                    "FGASES": {
-                        "basket": f"FGASES ({gwp_to_use})",
-                        "basket_contents": [f"HFCS ({gwp_to_use})"],
-                    },
-                    "HFC": {
-                        "basket": f"HFCS ({gwp_to_use})",
-                        "basket_contents": [f"UnspMixOfHFCs ({gwp_to_use})"],
-                    },
-                },
+                # "entities": {
+                #     "FGASES": {
+                #         "basket": f"FGASES ({gwp_to_use})",
+                #         "basket_contents": [f"HFCS ({gwp_to_use})"],
+                #     },
+                #     "HFC": {
+                #         "basket": f"HFCS ({gwp_to_use})",
+                #         "basket_contents": [f"UnspMixOfHFCs ({gwp_to_use})"],
+                #     },
+                # },
             },
             "basket_copy": {
                 "GWPs_to_add": ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
@@ -2268,13 +2268,23 @@ di_processing_templates = {
             },
             "remove_ts": {
                 "M.AG.ELV": {
-                    "category": ["4", "4.D", "4.E", "4.F", "15163", "24540"],
-                    "entities": ["N2O", f"KYOTOGHG ({gwp_to_use})"],
+                    "category": [
+                        "4",
+                        "4.A",
+                        "4.B",
+                        "4.C",
+                        "4.D",
+                        "4.E",
+                        "4.F",
+                        "15163",
+                        "24540",
+                    ],
+                    "entities": ["N2O", f"KYOTOGHG ({gwp_to_use})", "CH4", "NOx", "CO"],
                     "time": ["1993"],
                 },
             },
         },
-    },
+    },  # TODO: inconsistency through removed data for KYOTOGHG (SARGWP100)
     # TGO: more data in BUR / NIR, 1992-1998, 2000, 2005, 2010, 2013-2018 (
     # downscaling needed for some years, inconsistent detail)
     # THA: 1994 (2000-2013, extensive downscaling needed for 2000-2012).

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Mauritania/__init__.py

@@ -0,0 +1,30 @@
+"""Read Mauritania's BURs, NIRs, NCs
+
+Scripts and configurations to read Mauritania's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'MRT'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=MRT
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 803 - 0
src/unfccc_ghg_data/unfccc_reader/Mauritania/config_mrt_bur2.py

@@ -0,0 +1,803 @@
+"""Config for Mauritania BUR2
+
+Configuration for reading the Mauritania's BUR2 from pdf.
+Full configuration is contained here including configuration for conversions to
+primap2 data format.
+
+Not nicely structured, just copied from old script as this was integrated under time
+pressure to fix a GWP bug (wrong GWP stated in report)
+"""
+
+gwp_to_use = "AR4GWP100"
+terminology_proc = "IPCC2006_PRIMAP"
+
+table_defs = {
+    1990: [[0], [1], [2], [21], [28, 29], [42]],
+    1995: [[3], [4], [5], [22], [30, 31], [43]],
+    2000: [[6], [7], [8], [23], [32, 33], [44]],
+    2010: [[9], [10], [11], [24], [34, 35], [45]],
+    2012: [[12], [13], [14], [25], [36, 37], [46]],
+    2015: [[15], [16], [17], [26], [38, 39], [47]],
+    2018: [[18], [19], [20], [27], [40, 41], [48]],
+}
+
+page_def_templates = {
+    "24": {
+        "area": ["51,745,579,87"],
+        "cols": ["309,344,386,429,464,494,535"],
+    },
+    "odd": {  # 25, 27, 29, 31, 33, 35, 37
+        "area": ["51,745,551,244", "55,231,554,118"],
+        "cols": ["276,316,361,403,438,468,509", "276,319,361,407,441,472,511"],
+    },
+    "even": {  # 26, 28, 30, 32,34, 36
+        "area": ["51,745,579,87"],
+        "cols": ["304,344,386,429,464,494,535"],
+    },
+    "25": {  # 27, 29, 31, 33, 35, 37
+        "area": ["51,745,551,244", "55,231,554,118"],
+        "cols": ["276,316,361,403,438,468,509", "276,319,361,407,441,472,511"],
+    },
+    "26": {  # 28, 30, 32,34, 36
+        "area": ["51,745,579,87"],
+        "cols": ["309,344,386,429,464,494,535"],
+    },
+    "38": {
+        "area": ["33,749,566,54"],
+        "cols": ["220,243,263,283,308,336,359,415,471,493,517,546"],
+    },
+    "39": {
+        "area": ["32,749,577,54"],
+        "cols": ["224,254,275,294,320,345,367,426,482,503,525,553"],
+    },
+    "40": {
+        "area": ["32,749,577,54"],
+        "cols": ["224,245,265,287,314,338,360,420,476,496,518,546"],
+    },
+    "41": {  # 42
+        "area": ["32,749,577,54"],
+        "cols": ["220,245,265,287,314,338,360,420,476,496,518,546"],
+    },
+    "43": {
+        "area": ["32,749,577,54"],
+        "cols": ["220,245,268,287,314,338,360,420,476,496,518,546"],
+    },
+    "44": {
+        "area": ["32,749,577,54"],
+        "cols": ["220,245,268,283,314,338,360,420,476,496,518,546"],
+    },
+    "45": {
+        "area": ["66,716,556,49"],
+        "cols": ["287,362,399,441,479,515"],
+    },
+    "46": {
+        "area": ["68,779,554,715", "68,677,554,52"],
+        "cols": ["287,362,399,441,479,515", "308,387,423,453,480,510"],
+    },
+    "47": {
+        "area": ["68,779,556,670", "67,640,555,48"],
+        "cols": ["308,387,423,453,480,510", "308,387,423,453,480,510"],
+    },
+    "48": {
+        "area": ["67,778,552,639", "67,610,553,49"],
+        "cols": ["308,387,423,453,480,510", "308,387,423,453,480,510"],
+    },
+    "49": {
+        "area": ["67,778,552,609", "67,579,553,49"],
+        "cols": ["308,387,423,453,480,510", "308,387,423,453,480,510"],
+    },
+    "50": {
+        "area": ["67,778,552,578", "67,550,553,49"],
+        "cols": ["308,387,423,453,480,510", "308,387,423,453,480,510"],
+    },
+    "51": {
+        "area": ["67,778,552,549"],
+        "cols": ["308,387,423,453,480,510"],
+    },
+    "52": {
+        "area": ["67,753,549,54"],
+        "cols": ["308,387,423,453,480,510"],
+    },
+    "53": {
+        "area": ["68,779,556,737"],
+        "cols": ["308,387,423,453,480,510"],
+    },
+    "54": {
+        "area": ["56,751,565,616", "56,587,565,449", "56,419,565,252", "56,217,565,74"],
+        "cols": [
+            "282,315,346,412,447,482,528",
+            "282,315,346,412,447,482,528",
+            "282,315,346,412,447,482,528",
+            "282,315,346,412,447,482,528",
+        ],
+    },
+    "55": {
+        "area": ["56,752,565,600", "56,563,565,408", "56,369,565,216"],
+        "cols": [
+            "282,315,346,412,447,482,528",
+            "282,315,346,412,447,482,528",
+            "282,315,346,412,447,482,528",
+        ],
+    },
+}
+
+header_templates = {
+    "24": {  # tables 0:20, 42: end
+        "entity": ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "NMVOCs", "SO2"],
+        "unit": ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
+        "rows": 2,
+    },
+    "38": {  # tables 21:27
+        "entity": [
+            "Catégories",
+            "CO2",
+            "CH4",
+            "N2O",
+            "HFCs",
+            "PFCs",
+            "SF6",
+            "Autres gaz halogénés avec facteurs de conversion équivalent CO2",
+            "Autres gaz halogénés sans facteurs de conversion équivalent CO2",
+            "NOx",
+            "CO",
+            "NMVOCs",
+            "SO2",
+        ],
+        "unit": [
+            "",
+            "Gg",
+            "Gg",
+            "Gg",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "Gg",
+            "Gg",
+            "Gg",
+            "Gg",
+            "Gg",
+        ],
+        "rows": 7,
+    },
+    "45": {  # tables 28:41
+        "entity": [
+            "Catégories",
+            "Émissions/ absorptions CO2",
+            "CH4",
+            "N2O",
+            "NOx",
+            "CO",
+            "COVNM",
+        ],
+        "unit": ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
+        "rows": 4,
+    },
+    "54": {  # tables 42:
+        "entity": ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "NMVOCs", "SO2"],
+        "unit": ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
+        "rows": 3,
+    },
+}
+
+fix_rows_template = {
+    "24": {
+        2: [
+            "1.A.1.c   Transformation des combustibles solides et autres industries",
+        ],
+    },
+    "25_1": {
+        2: [
+            "1.B.1.a.i.2  Emissions de gaz des couches lors des activités",
+            "1.B.1.a.i.4 Combustion du méthane asséché ou",
+            "1.B.1.a.ii.2  Emissions de gaz des couches lors des",
+        ],
+    },
+    "25_2": {
+        2: [
+            "Émissions de CO2 imputables à la combustion de labiomasse pour",
+        ],
+    },
+    "26": {
+        2: [
+            "1.A.1.c   Transformation des combustibles solides et autres industries",
+            "1.A.2.i Industries extractives (à l’exclusion de l’extraction de",
+        ],
+    },
+    "38": {
+        2: [
+            "2.D   Produits non énergétiques imputables aux",
+            "2.F   Utilisations de produits comme substituts de",
+        ],
+    },
+    "39": {
+        2: [
+            "2.D   Produits non énergétiques imputables aux combustibles",
+            "2.F   Utilisations de produits comme substituts de substances",
+        ],
+    },
+    "44": {
+        -2: [
+            "2  PROCÉDÉS INDUSTRIELS ET UTIL. DES PRODUITS",
+        ],
+        2: [
+            "2.D   Produits non énergétiques imputables aux",
+            "2.F   Utilisations de produits comme substituts de",
+        ],
+    },
+}
+
+table_reading_defs = {
+    0: {
+        "page": "24",
+        "table": 0,
+        "page_def": page_def_templates["24"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["24"],
+    },
+    1: {
+        "page": "25",
+        "table": 0,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_1"],
+    },
+    2: {
+        "page": "25",
+        "table": 1,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_2"],
+    },
+    3: {
+        "page": "26",
+        "table": 0,
+        "page_def": page_def_templates["even"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["26"],
+    },
+    4: {
+        "page": "27",
+        "table": 0,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_1"],
+    },
+    5: {
+        "page": "27",
+        "table": 1,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_2"],
+    },
+    6: {
+        "page": "28",
+        "table": 0,
+        "page_def": page_def_templates["even"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["26"],
+    },
+    7: {
+        "page": "29",
+        "table": 0,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_1"],
+    },
+    8: {
+        "page": "29",
+        "table": 1,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_2"],
+    },
+    9: {
+        "page": "30",
+        "table": 0,
+        "page_def": page_def_templates["even"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["26"],
+    },
+    10: {
+        "page": "31",
+        "table": 0,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_1"],
+    },
+    11: {
+        "page": "31",
+        "table": 1,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_2"],
+    },
+    12: {
+        "page": "32",
+        "table": 0,
+        "page_def": page_def_templates["even"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["26"],
+    },
+    13: {
+        "page": "33",
+        "table": 0,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_1"],
+    },
+    14: {
+        "page": "33",
+        "table": 1,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_2"],
+    },
+    15: {
+        "page": "34",
+        "table": 0,
+        "page_def": page_def_templates["even"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["26"],
+    },
+    16: {
+        "page": "35",
+        "table": 0,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_1"],
+    },
+    17: {
+        "page": "35",
+        "table": 1,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_2"],
+    },
+    18: {
+        "page": "36",
+        "table": 0,
+        "page_def": page_def_templates["even"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["26"],
+    },
+    19: {
+        "page": "37",
+        "table": 0,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_1"],
+    },
+    20: {
+        "page": "37",
+        "table": 1,
+        "page_def": page_def_templates["odd"],
+        "header": header_templates["24"],
+        "fix_rows": fix_rows_template["25_2"],
+    },
+    21: {
+        "page": "38",
+        "table": 0,
+        "page_def": page_def_templates["38"],
+        "header": header_templates["38"],
+        "fix_rows": fix_rows_template["38"],
+    },
+    22: {
+        "page": "39",
+        "table": 0,
+        "page_def": page_def_templates["39"],
+        "header": header_templates["38"],
+        "fix_rows": fix_rows_template["39"],
+    },
+    23: {
+        "page": "40",
+        "table": 0,
+        "page_def": page_def_templates["40"],
+        "header": header_templates["38"],
+        "fix_rows": fix_rows_template["38"],
+    },
+    24: {
+        "page": "41",
+        "table": 0,
+        "page_def": page_def_templates["41"],
+        "header": header_templates["38"],
+        "fix_rows": fix_rows_template["38"],
+    },
+    25: {
+        "page": "42",
+        "table": 0,
+        "page_def": page_def_templates["41"],
+        "header": header_templates["38"],
+        "fix_rows": fix_rows_template["38"],
+    },
+    26: {
+        "page": "43",
+        "table": 0,
+        "page_def": page_def_templates["43"],
+        "header": header_templates["38"],
+        "fix_rows": fix_rows_template["38"],
+    },
+    27: {
+        "page": "44",
+        "table": 0,
+        "page_def": page_def_templates["44"],
+        "header": header_templates["38"],
+        "fix_rows": fix_rows_template["44"],
+    },
+    28: {
+        "page": "45",
+        "table": 0,
+        "page_def": page_def_templates["45"],
+        "header": header_templates["45"],
+    },
+    29: {
+        "page": "46",
+        "table": 0,
+        "page_def": page_def_templates["46"],
+        "header": header_templates["45"],
+    },
+    30: {
+        "page": "46",
+        "table": 1,
+        "page_def": page_def_templates["46"],
+        "header": header_templates["45"],
+    },
+    31: {
+        "page": "47",
+        "table": 0,
+        "page_def": page_def_templates["47"],
+        "header": header_templates["45"],
+    },
+    32: {
+        "page": "47",
+        "table": 1,
+        "page_def": page_def_templates["47"],
+        "header": header_templates["45"],
+    },
+    33: {
+        "page": "48",
+        "table": 0,
+        "page_def": page_def_templates["48"],
+        "header": header_templates["45"],
+    },
+    34: {
+        "page": "48",
+        "table": 1,
+        "page_def": page_def_templates["48"],
+        "header": header_templates["45"],
+    },
+    35: {
+        "page": "49",
+        "table": 0,
+        "page_def": page_def_templates["49"],
+        "header": header_templates["45"],
+    },
+    36: {
+        "page": "49",
+        "table": 1,
+        "page_def": page_def_templates["49"],
+        "header": header_templates["45"],
+    },
+    37: {
+        "page": "50",
+        "table": 0,
+        "page_def": page_def_templates["50"],
+        "header": header_templates["45"],
+    },
+    38: {
+        "page": "50",
+        "table": 1,
+        "page_def": page_def_templates["50"],
+        "header": header_templates["45"],
+    },
+    39: {
+        "page": "51",
+        "table": 0,
+        "page_def": page_def_templates["51"],
+        "header": header_templates["45"],
+    },
+    40: {
+        "page": "52",
+        "table": 0,
+        "page_def": page_def_templates["52"],
+        "header": header_templates["45"],
+    },
+    41: {
+        "page": "53",
+        "table": 0,
+        "page_def": page_def_templates["53"],
+        "header": header_templates["45"],
+    },
+    42: {
+        "page": "54",
+        "table": 0,
+        "page_def": page_def_templates["54"],
+        "header": header_templates["54"],
+    },
+    43: {
+        "page": "54",
+        "table": 1,
+        "page_def": page_def_templates["54"],
+        "header": header_templates["54"],
+    },
+    44: {
+        "page": "54",
+        "table": 2,
+        "page_def": page_def_templates["54"],
+        "header": header_templates["54"],
+    },
+    45: {
+        "page": "54",
+        "table": 2,
+        "page_def": page_def_templates["54"],
+        "header": header_templates["54"],
+    },
+    46: {
+        "page": "55",
+        "table": 0,
+        "page_def": page_def_templates["55"],
+        "header": header_templates["54"],
+    },
+    47: {
+        "page": "55",
+        "table": 1,
+        "page_def": page_def_templates["55"],
+        "header": header_templates["54"],
+    },
+    48: {
+        "page": "55",
+        "table": 2,
+        "page_def": page_def_templates["55"],
+        "header": header_templates["54"],
+    },
+}
+
+
+remove_per_table = [
+    [
+        "1.A.3.d.i di Navigation internationale (soutes internationales) (1)",
+        "1.A.3.a.i ai Aviation internationale (soutes internationales) (1)",
+        "1.A.5.c Opérations multilatérales (1) (2)",
+    ],  # these could also be removed globally as names slightly different
+    [],
+    [],
+    [],
+    [],
+    [],
+]
+
+fix_cat_values = {
+    "Catégorie": "Catégories",
+}
+
+fix_cat_using_preceeding = {  # fix cat code based on cat code before
+    "3.A.2.i Volaille": {"3.A.1.h Porcins": "3.A.1.i Volaille"},
+}
+
+# definitions for conversion to long format with standardized unit format
+unit_row = 0
+entity_row = 1
+unit_entity_rows = [unit_row, entity_row]
+
+index_cols = ["Catégories"]
+
+# special header as category code and name in one column
+header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
+
+overlap_problems = {
+    "1.A.2.i Industries extractives (à l’exclusion de l’extraction de combustibles) 113,528": [
+        "1.A.2.i Industries extractives (à l’exclusion de l’extraction de combustibles)",
+        "113,528",
+    ],
+}
+
+## definitions part 2: conversion to PRIMAP2 interchange format
+
+# rows to remove
+cats_remove = ["Information Items", "Memo Items (3)"]
+
+# manual category codes
+cat_codes_manual = {
+    "Soutesinternationales": "M.BK",
+    "Émissions de CO2 imputables à la combustion de labiomasse pour la production d’énergie": "M.BIO",
+    "1.A.3.d.i Navigation internationale": "M.BK.M",
+    "1.A.3.a.i Aviation internationale": "M.BK.A",
+    "1.A.5.c - Opérations multilatérales": "M.MULTIOP",
+}
+
+cat_code_regexp = r"(?P<code>^[a-zA-Z0-9\.]{1,14})\s.*"
+
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+coords_defaults = {
+    "source": "Mauritania-GHG-inventory",
+    "provenance": "measured",
+    "area": "MRT",
+    "scenario": "BUR2",
+}
+
+coords_value_mapping = {
+    "unit": "PRIMAP1",
+    "entity": {
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "NMVOCs": "NMVOC",
+        "COVNM": "NMVOC",
+        "Net CO2": "CO2",
+        "Émissions/ absorptions CO2": "CO2",
+        "Émissions/ absorptions nettes de CO2": "CO2",
+        "Autres gaz halogénés avec facteurs de conversion équivalent CO2": f"OTHERHFCS ({gwp_to_use})",
+        #'Other halogenated gases without CO2 equivalent conversion factors (2)': 'OTHERHFCS',
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "SF6": f"SF6 ({gwp_to_use})",
+        "HFC-23": "HFC23",
+        "HFC-32": "HFC32",
+        "HFC-41": "HFC41",
+        "HFC-43-10mee": "HFC4310mee",
+        "HFC-125": "HFC125",
+        "HFC-134": "HFC134",
+        "HFC-134a": "HFC134a",
+        "HFC-152a": "HFC152a",
+        "HFC-143": "HFC143",
+        "HFC-143a": "HFC143a",
+        "HFC-227ea": "HFC227ea",
+        "HFC-236fa": "HFC236fa",
+        "HFC-245ca": "HFC245ca",
+        "c-C4F8": "cC4F8",
+    },
+}
+
+coords_cols = {"category": "category", "entity": "entity", "unit": "unit"}
+
+add_coords_cols = {
+    "orig_cat_name": ["orig_cat_name", "category"],
+}
+
+filter_remove = {
+    "f1": {
+        "entity": ["Autres gaz halogénés sans facteurs de conversion équivalent CO2"],
+    },
+}
+
+meta_data = {
+    "references": "https://unfccc.int/documents/279303",
+    "rights": "",
+    "contact": "mail@johannes-guetschow.de",
+    "title": "République Islamique de Mauritanie - RAPPORT NATIONAL DES INVENTAIRES DES GAZ A EFFET DE SERRE - RNI",
+    "comment": "Read fom pdf file (Mauritania BUR 2 - NIR Annexes - May 2020.pdf) by Johannes Gütschow. ",
+    "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
+}
+
+# part 3 fgases defintions
+table_defs_fgases = {
+    1990: [0],
+    1995: [1, 2],
+    2000: [3],
+    2010: [4, 5],
+    2012: [6],
+    2015: [7, 8],
+    2018: [9],
+}
+pages_fgases = ["92", "92", "93", "93", "93", "94", "94", "94", "95", "95"]
+
+area_fgases = [
+    "55,508,833,280",
+    "55,263,833,50",
+    "55,530,833,511",
+    "55,491,833,264",
+    "55,244,833,51",
+    "55,532,833,493",
+    "55,473,833,245",
+    "55,224,833,53",
+    "55,530,833,473",
+    "55,430,833,200",
+]
+cols_fgases = [
+    "259,300,320,345,373,391,422,444,465,486,508,534,561,585,613,642,671,693,721,748,776,805"
+]
+
+rows_to_fix_fgases = {
+    2: ["2.F   Utilisations de produits comme substituts de substances"],
+    -3: ["Catégories"],
+}
+
+# definitions for conversion to long format with standardized unit format
+unit_row_fgases = 1
+entity_row_fgases = 0
+unit_entity_rows_fgases = [unit_row_fgases, entity_row_fgases]
+unit_info_fgases = {
+    "default_unit": "",
+    "regexp_entity": r"^.*",
+    "regexp_unit": None,  # temp fix until param is marked as optional in PRIMAP2
+    "manual_repl_unit": {
+        "Catégories": "",
+        "Émissions en unité de masse d’origine (tonne)": "t",
+    },
+}
+
+
+first_ignore_cat_fgases = "Émissions en unité équivalent CO2 (Gg Eq-CO2)"
+cats_remove_fgases = [
+    "Facteurs de conversion  équivalent CO2 [GWP du SAR sur 100 ans ]"
+]
+
+entities_to_remove_fgases = ["Total HFCs", "Total PFCs"]
+
+## processing
+proc_info_country = {
+    "aggregate_coords": {
+        "category": {
+            "2.D": {
+                "sources": ["2.D.1", "2.D.2", "2.D.3", "2.D.4"],
+                # 'name': 'Non-Energy Products from Fuels and Solvent Use'
+            },
+            "2.G.1": {
+                "sources": ["2.G.1.a"],
+                # 'name': 'Electrical Equipment'
+            },
+            "2.G": {
+                "sources": ["2.G.1", "2.G.2", "2.G.3", "2.G.4"],
+                # 'name': 'Other Product Manufacture and Use'
+            },
+            "2.F": {
+                "sources": ["2.F.1", "2.F.2", "2.F.3", "2.F.4", "2.F.5", "2.F.6"],
+                # 'name': 'Product uses as Substitutes for Ozone Depleting Substances'
+            },  # needed for fgases only
+            "2.H": {
+                "sources": ["2.H.1", "2.H.3"],
+                # 'name': 'Other'
+            },
+            "2": {
+                "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"],
+                # 'name': 'IPPU'
+            },  # needed for fgases only
+            "M.3.C.1.AG": {
+                "sources": ["3.C.1.c"],
+                # 'name': 'Emissions from Biomass Burning (Agriculture)'
+            },
+            "M.3.C.AG": {
+                "sources": ["M.3.C.1.AG", "3.C.3", "3.C.4"],
+                # 'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'
+            },
+            "M.AG.ELV": {
+                "sources": ["M.3.C.AG"],
+                # 'name': 'Agriculture excluding livestock emissions'
+            },
+            "M.AG": {
+                "sources": ["3.A", "M.AG.ELV"],
+                # 'name': 'Agriculture'
+            },
+            "M.LULUCF": {
+                "sources": ["3.B"],
+                # 'name': 'Land Use, Land Use Change, and Forestry'
+            },
+            "3": {
+                "sources": ["M.AG", "M.LULUCF"],
+                # 'name': 'AFOLU'
+            },
+            "M.0.EL": {
+                "sources": ["1", "2", "M.AG", "4"],
+                # 'name': 'National Total Excluding LULUCF'
+            },
+            "0": {
+                "sources": ["1", "2", "3", "4"],
+                # 'name': 'National Total'
+            },  # needed for fgases only
+        },
+    },
+    "remove_ts": {
+        "2A_NMVOC": {  # should be 0
+            "category": ["2.A"],
+            "entities": ["NMVOC"],
+            "time": ["1990"],
+        },
+        "2D_NMVOC": {  # is 0 needs to be recomputed
+            "category": ["2.D"],
+            "entities": ["NMVOC"],
+            "time": ["2012"],
+        },
+    },
+}

+ 482 - 0
src/unfccc_ghg_data/unfccc_reader/Mauritania/read_MRT_BUR2_from_pdf.py

@@ -0,0 +1,482 @@
+"""
+Read data from Mauritania's BUR2.
+
+Data are read from pdf. The file contains a detailed inventory for
+1990, 1995, 2000, 2010, 2012, 2015, 2018.
+
+"""
+
+
+import camelot
+import numpy as np
+import pandas as pd
+import primap2 as pm2
+
+from unfccc_ghg_data.helper import (
+    compression,
+    downloaded_data_path,
+    extracted_data_path,
+    fix_rows,
+    gas_baskets,
+    process_data_for_country,
+)
+from unfccc_ghg_data.unfccc_reader.Mauritania.config_mrt_bur2 import (
+    area_fgases,
+    cat_code_regexp,
+    cat_codes_manual,
+    cats_remove,
+    cats_remove_fgases,
+    cols_fgases,
+    coords_cols,
+    coords_defaults,
+    coords_terminologies,
+    coords_value_mapping,
+    entities_to_remove_fgases,
+    entity_row,
+    entity_row_fgases,
+    filter_remove,
+    first_ignore_cat_fgases,
+    fix_cat_using_preceeding,
+    fix_cat_values,
+    gwp_to_use,
+    header_long,
+    index_cols,
+    meta_data,
+    pages_fgases,
+    proc_info_country,
+    remove_per_table,
+    rows_to_fix_fgases,
+    table_defs,
+    table_defs_fgases,
+    table_reading_defs,
+    terminology_proc,
+    unit_entity_rows,
+    unit_entity_rows_fgases,
+    unit_info_fgases,
+    unit_row,
+    unit_row_fgases,
+)
+
+if __name__ == "__main__":
+    # ###
+    # configuration
+    # ###
+    input_folder = downloaded_data_path / "UNFCCC" / "Mauritania" / "BUR2"
+    output_folder = extracted_data_path / "UNFCCC" / "Mauritania"
+    if not output_folder.exists():
+        output_folder.mkdir()
+
+    output_filename = "MRT_BUR2_2020_"
+    inventory_file = "Mauritania_BUR_2_-_NIR_Annexes_-_May_2020.pdf"
+
+    # ###
+    # read the tables from pdf
+    # ###
+
+    ## main tables
+    # empty dataframe
+    df_all = None
+    for year in table_defs.keys():
+        print(f"Working on year {year}")
+
+        # join the tables which need combining
+        df_this_year = None
+        for table_parts, cats_remove_this_table in zip(
+            table_defs[year], remove_per_table
+        ):
+            new_table = camelot.read_pdf(
+                str(input_folder / inventory_file),
+                pages=table_reading_defs[table_parts[0]]["page"],
+                table_areas=[
+                    table_reading_defs[table_parts[0]]["page_def"]["area"][
+                        table_reading_defs[table_parts[0]]["table"]
+                    ]
+                ],
+                columns=[
+                    table_reading_defs[table_parts[0]]["page_def"]["cols"][
+                        table_reading_defs[table_parts[0]]["table"]
+                    ]
+                ],
+                flavor="stream",
+                split_text=True,
+            )
+            df_this_table = new_table[0].df
+            if "fix_rows" in table_reading_defs[table_parts[0]].keys():
+                rows_to_fix = table_reading_defs[table_parts[0]]["fix_rows"]
+                for n_rows in rows_to_fix:
+                    df_this_table = fix_rows(
+                        df_this_table,
+                        rows_to_fix=rows_to_fix[n_rows],
+                        col_to_use=0,
+                        n_rows=n_rows,
+                    )
+            if len(table_parts) > 1:
+                parts_remaining = table_parts[1:]
+                for part in parts_remaining:
+                    new_table = camelot.read_pdf(
+                        str(input_folder / inventory_file),
+                        pages=table_reading_defs[part]["page"],
+                        table_areas=[
+                            table_reading_defs[part]["page_def"]["area"][
+                                table_reading_defs[part]["table"]
+                            ]
+                        ],
+                        columns=[
+                            table_reading_defs[part]["page_def"]["cols"][
+                                table_reading_defs[part]["table"]
+                            ]
+                        ],
+                        flavor="stream",
+                        split_text=True,
+                    )
+                    df_new_table_part = new_table[0].df
+                    if "fix_rows" in table_reading_defs[part].keys():
+                        rows_to_fix = table_reading_defs[part]["fix_rows"]
+                        for n_rows in rows_to_fix:
+                            df_new_table_part = fix_rows(
+                                df_new_table_part,
+                                rows_to_fix=rows_to_fix[n_rows],
+                                col_to_use=0,
+                                n_rows=n_rows,
+                            )
+                    df_this_table = pd.concat([df_this_table, df_new_table_part])
+
+            df_this_table = df_this_table.reset_index(drop=True)
+
+            df_this_table = df_this_table.drop(
+                df_this_table.index[
+                    : table_reading_defs[table_parts[0]]["header"]["rows"]
+                ],
+            )
+            df_this_table.columns = [
+                table_reading_defs[table_parts[0]]["header"]["entity"],
+                table_reading_defs[table_parts[0]]["header"]["unit"],
+            ]
+
+            # replace '' by nan for filling
+            df_this_table.iloc[unit_entity_rows] = df_this_table.iloc[
+                unit_entity_rows
+            ].replace("", np.nan)
+            # fill the units to the right as for merged cells the unit is only
+            # in the first cell
+            df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].ffill(axis=0)
+            # fill entity from unit if empty
+            df_this_table.iloc[unit_entity_rows] = df_this_table.iloc[
+                unit_entity_rows
+            ].ffill()
+
+            # fix values in category col
+            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].replace(fix_cat_values)
+
+            # replace line breaks, double, and triple spaces in category names
+            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
+            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
+            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
+
+            # fix category values using preceding categories
+            for cat in fix_cat_using_preceeding:
+                mask = df_this_table.iloc[:, 0] == cat
+                if any(mask):
+                    print(f"Found occurence of category to fix {cat}")
+                    indices = np.where(mask)[0]
+                    for idx in indices:
+                        if (
+                            df_this_table.iloc[idx - 1, 0]
+                            in fix_cat_using_preceeding[cat].keys()
+                        ):
+                            df_this_table.iloc[idx, 0] = fix_cat_using_preceeding[cat][
+                                df_this_table.iloc[idx - 1, 0]
+                            ]
+                            print(
+                                f"Replaced {cat} by {fix_cat_using_preceeding[cat][df_this_table.iloc[idx - 1, 0]]}"  # noqa: E501
+                            )
+
+            # reindex because we have double indices
+            df_this_table = df_this_table.reset_index(drop=True)
+
+            # remove given rows
+            for cat in cats_remove_this_table:
+                # old_len = len(df_this_table)
+                df_this_table = df_this_table.drop(
+                    df_this_table[df_this_table[index_cols[0]] == cat].index
+                )
+                # new_len = len(df_this_table)
+                # print(f"Removed {old_len - new_len} rows from table for year {year}
+                # and category {cat}.")
+
+            # set index and convert to long format
+            df_this_table = df_this_table.set_index(index_cols)
+            # df_before_convert = df_this_table.copy(deep=True)
+            df_this_table_long = pm2.pm2io.nir_convert_df_to_long(
+                df_this_table, year, header_long
+            )
+
+            # combine with tables for other sectors (merge not append)
+            if df_this_year is None:
+                df_this_year = df_this_table_long
+            else:
+                df_this_year = pd.concat([df_this_year, df_this_table_long])
+
+        # aggregate years to df_all
+        if df_all is None:
+            df_all = df_this_year
+        else:
+            df_all = pd.concat([df_all, df_this_year])
+
+    df_all = df_all.reset_index(drop=True)
+
+    ## fgases ##############
+
+    df_all_fgases = None
+    for year in table_defs_fgases.keys():
+        print(f"Working on fgases year {year}")
+
+        # join the tables which need combining
+        table_parts = table_defs_fgases[year]
+        tables_fgases = camelot.read_pdf(
+            str(input_folder / inventory_file),
+            pages=pages_fgases[table_parts[0]],
+            table_areas=[area_fgases[table_parts[0]]],
+            columns=cols_fgases,
+            flavor="stream",
+            split_text=True,
+        )
+        df_this_year = tables_fgases[0].df.copy(deep=True)
+        if len(table_parts) > 1:
+            parts_remaining = table_parts[1:]
+            for part in parts_remaining:
+                tables_fgases = camelot.read_pdf(
+                    str(input_folder / inventory_file),
+                    pages=pages_fgases[part],
+                    table_areas=[area_fgases[part]],
+                    columns=cols_fgases,
+                    flavor="stream",
+                    split_text=True,
+                )
+                df_this_year = pd.concat([df_this_year, tables_fgases[0].df])
+
+        # reindex because we have double indices
+        df_this_year = df_this_year.reset_index(drop=True)
+
+        for n_rows in rows_to_fix_fgases:
+            df_this_year = fix_rows(
+                df_this_year,
+                rows_to_fix=rows_to_fix_fgases[n_rows],
+                col_to_use=0,
+                n_rows=n_rows,
+            )
+
+        # remove additional header rows
+        for cat in cats_remove_fgases:
+            df_this_year = df_this_year.drop(df_this_year[df_this_year[0] == cat].index)
+
+        # add category col label if missing
+        if (df_this_year.iloc[entity_row][0] == "") & (
+            str(df_this_year.iloc[unit_row][0]) == ""
+        ):
+            print(f"Add category header for table {table_parts[0]}")
+            df_this_year.iloc[entity_row][0] = index_cols[0]
+
+        # replace '' by nan for filling
+        df_this_year.iloc[unit_entity_rows_fgases] = df_this_year.iloc[
+            unit_entity_rows_fgases
+        ].replace("", np.nan)
+        # fill the units to the right as for merged cells the unit is only in
+        # the first cell
+        df_this_year.iloc[unit_row_fgases] = df_this_year.iloc[unit_row_fgases].fillna(
+            axis=0, method="ffill"
+        )
+
+        # replace line breaks in units and entities
+        df_this_year.iloc[entity_row_fgases] = df_this_year.iloc[
+            entity_row_fgases
+        ].str.replace("\n", "")
+        df_this_year.iloc[unit_row_fgases] = df_this_year.iloc[
+            unit_row_fgases
+        ].str.replace("\n", "")
+        df_this_year.iloc[entity_row_fgases] = df_this_year.iloc[
+            entity_row_fgases
+        ].str.replace("   ", " ")
+        df_this_year.iloc[unit_row_fgases] = df_this_year.iloc[
+            unit_row_fgases
+        ].str.replace("   ", " ")
+        df_this_year.iloc[entity_row_fgases] = df_this_year.iloc[
+            entity_row_fgases
+        ].str.replace("  ", " ")
+        df_this_year.iloc[unit_row_fgases] = df_this_year.iloc[
+            unit_row_fgases
+        ].str.replace("  ", " ")
+        df_this_year.iloc[entity_row_fgases] = df_this_year.iloc[
+            entity_row_fgases
+        ].str.strip()
+        df_this_year.iloc[unit_row_fgases] = df_this_year.iloc[
+            unit_row_fgases
+        ].str.strip()
+
+        # replace line breaks, double, and triple spaces in category names
+        df_this_year.iloc[:, 0] = df_this_year.iloc[:, 0].str.replace("\n", " ")
+        df_this_year.iloc[:, 0] = df_this_year.iloc[:, 0].str.replace("   ", " ")
+        df_this_year.iloc[:, 0] = df_this_year.iloc[:, 0].str.replace("  ", " ")
+
+        # set unit row cat label to nan
+        df_this_year.iloc[unit_row_fgases, 0] = np.nan
+
+        # remove second part of table with GWP weighted data
+        idx = df_this_year[
+            df_this_year.iloc[:, 0] == first_ignore_cat_fgases
+        ].index.tolist()[0]
+        df_this_year = df_this_year.loc[: idx - 1]
+
+        df_this_year = pm2.pm2io.nir_add_unit_information(
+            df_this_year,
+            unit_row=unit_row_fgases,
+            entity_row=entity_row_fgases,
+            **unit_info_fgases,
+        )
+
+        # remove entities
+        df_this_year = df_this_year.drop(columns=entities_to_remove_fgases)
+
+        # set index and convert to long format
+        df_this_year = df_this_year.set_index(index_cols)
+        df_this_year_long = pm2.pm2io.nir_convert_df_to_long(
+            df_this_year, year, header_long
+        )
+
+        # aggregate years to df_all
+        if df_all_fgases is None:
+            df_all_fgases = df_this_year_long
+        else:
+            df_all_fgases = pd.concat([df_all_fgases, df_this_year_long])
+
+    # combine with other data
+    df_all = pd.concat([df_all, df_all_fgases])
+
+    # drop the rows with memo items etc
+    for cat in cats_remove:
+        df_all = df_all.drop(df_all[df_all["orig_cat_name"] == cat].index)
+
+        # make a copy of the categories row
+    df_all["category"] = df_all["orig_cat_name"]
+
+    # temp: drop NOx and CO as the data is not read properly
+    # df_all = df_all.drop(df_all[df_all["entity"] == "CO"].index)
+    # df_all = df_all.drop(df_all[df_all["entity"] == "NOx"].index)
+
+    # replace cat names by codes in col "category"
+    # first the manual replacements
+    df_all["category"] = df_all["category"].replace(cat_codes_manual)
+
+    # then the regex replacements
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_all["category"] = df_all["category"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
+    df_all = df_all.reset_index(drop=True)
+
+    # replace "," with "." in data and remove space in number
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(",", ".", regex=False)
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(". ", ".", regex=False)
+
+    # make sure all col headers are str
+    df_all.columns = df_all.columns.map(str)
+
+    data_if = pm2.pm2io.convert_long_dataframe_if(
+        df_all,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
+
+    data_if = data_if.drop(columns="orig_cat_name")
+    data_if.attrs["dimensions"]["*"].remove("orig_cat_name")
+
+    # conversion to PRIMAP2 native format
+    data_pm2 = pm2.pm2io.from_interchange_format(data_if)
+
+    # ###
+    # save data to IF and native format
+    # ###
+    data_if = data_pm2.pr.to_interchange_format()
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
+
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
+
+    ### processing
+    data_proc_pm2 = data_pm2  # no copy needed; data_pm2 is not used afterwards
+    # fix HFC values (code is more general than needed as prep for transfer
+    # into a function)
+    HFC_fix = {  # SAR GWP while rest uses AR4
+        "dim": "category",
+        "source_value": "2",
+        "target_values": ["2.F", "2.F.1", "2.F.1.a"],
+        "filter": {
+            "variable": [f"HFCS ({gwp_to_use})"],
+            "time": ["2000", "2010"],
+        },
+    }
+    filter = HFC_fix["filter"]
+    variables = data_proc_pm2.data_vars
+    if "variable" in filter:
+        filter_vars = filter.pop("variable")
+        variables = [var for var in filter_vars if var in variables]
+
+    filter_source = filter.copy()
+    filter_source[HFC_fix["dim"]] = HFC_fix["source_value"]
+    for var in variables:
+        source_data = data_proc_pm2[var].pr.loc[filter_source]
+        for value in HFC_fix["target_values"]:
+            data_proc_pm2[var] = data_proc_pm2[var].pr.set(
+                HFC_fix["dim"], value, source_data, existing="overwrite"
+            )
+
+    # actual processing
+    data_proc_pm2 = process_data_for_country(
+        data_pm2,
+        entities_to_ignore=[],
+        gas_baskets=gas_baskets,
+        processing_info_country=proc_info_country,
+        cat_terminology_out=terminology_proc,
+    )
+
+    # adapt source and metadata
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
+    data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.loc[{"source": ["BUR_NIR"]}]
+
+    # ###
+    # save data to IF and native format
+    # ###
+    data_proc_if = data_proc_pm2.pr.to_interchange_format()
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + terminology_proc),
+        data_proc_if,
+    )
+
+    encoding = {var: compression for var in data_proc_pm2.data_vars}
+    data_proc_pm2.pr.to_netcdf(
+        output_folder / (output_filename + terminology_proc + ".nc"),
+        encoding=encoding,
+    )