11 månader sedan · b5cb1c03e8
--- a/UNFCCC_GHG_data/UNFCCC_reader/Mongolia/config_MNG_BUR2.py
+++ b/UNFCCC_GHG_data/UNFCCC_reader/Mongolia/config_MNG_BUR2.py
@@ -0,0 +1,374 @@
 
				+coords_terminologies = {
			
 
				+    "area": "ISO3",
			
 
				+    "category": "IPCC2006_PRIMAP",
			
 
				+    "scenario": "PRIMAP",
			
 
				+}
			
 
				+
			
 
				+inv_conf = {
			
 
				+    "entity_row": 0,
			
 
				+    "unit_row": 1,
			
 
				+    "index_cols": "Greenhouse gas source and sink categories",
			
 
				+    "header_long": ["orig_cat_name", "entity", "unit", "time", "data"],
			
 
				+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
			
 
				+    "cat_codes_manual": {
			
 
				+        # remove whitespace at start of line
			
 
				+        " 2.G.2 -SF6 and PFCs from Other Product Uses": "2.G.2 - SF6 and PFCs from Other Product Uses",
			
 
				+        " 2.G.3 -N2O from Product Uses": "2.G.3 - N2O from Product Uses",
			
 
				+        " 1.C.1 -Transport of CO2": "1.C.1 - Transport of CO2",
			
 
				+        " 3.C.1 -Emissions from biomass burning ": "3.C.1",
			
 
				+        "Memo Items (5)": "MEMO",
			
 
				+        "International Bunkers": "M.BK",
			
 
				+        "1.A.3.a.i - International Aviation (International Bunkers) (1)": "M.BK.A",
			
 
				+        "1.A.3.d.i - International water-borne navigation (International bunkers) (1)": "M.BK.M",
			
 
				+        "1.A.5.c - Multilateral Operations (1)(2)": "M.MULTIOP",
			
 
				+        "Total National Emissions and Removals": "0",
			
 
				+    },
			
 
				+    "header": [
			
 
				+        "Greenhouse gas source and sink categories",
			
 
				+        "CO2",
			
 
				+        "CH4",
			
 
				+        "N2O",
			
 
				+        "HFCs",
			
 
				+        "PFCs",
			
 
				+        "SF6",
			
 
				+        "other halogenated gases",
			
 
				+        "Other halogenated gases without CO2 equivalent conversion factors",
			
 
				+        "NOx",
			
 
				+        "CO",
			
 
				+        "NMVOCs",
			
 
				+        "SO2",
			
 
				+    ],
			
 
				+    "unit": [
			
 
				+        "-",
			
 
				+        "Gg",
			
 
				+        "Gg",
			
 
				+        "Gg",
			
 
				+        "GgCO2eq",
			
 
				+        "GgCO2eq",
			
 
				+        "GgCO2eq",
			
 
				+        "GgCO2eq",
			
 
				+        "Gg",
			
 
				+        "Gg",
			
 
				+        "Gg",
			
 
				+        "Gg",
			
 
				+        "Gg",
			
 
				+    ],
			
 
				+}
			
 
				+
			
 
				+inv_conf_per_year = {
			
 
				+    "1990": {
			
 
				+        "pages_to_read": ["176", "177", "178", "179"],
			
 
				+        "rows_to_fix": {
			
 
				+            3: [
			
 
				+                "1.A.2 - Manufacturing Industries and",
			
 
				+                "2.B.4 - Caprolactam. Glyoxal and Glyoxylic Acid",
			
 
				+                "2.B.8 - Petrochemical and Carbon Black",
			
 
				+                "2.D - Non-Energy Products from Fuels and",
			
 
				+                "2.F - Product Uses as Substitutes for Ozone",
			
 
				+                "3.C - Aggregate sources and non-CO2 emissions",
			
 
				+                "3.C.4 - Direct N2O Emissions from managed",
			
 
				+                "3.C.5 - Indirect N2O Emissions from managed",
			
 
				+                "3.C.6 - Indirect N2O Emissions from manure",
			
 
				+                "5.A - Indirect N2O emissions from the atmospheric",
			
 
				+                "1.A.3.d.i - International water-borne navigation",
			
 
				+                "1.A.3.a.i - International Aviation (International",
			
 
				+            ],
			
 
				+            -2: ["3.C.1 - Emissions from biomass burning"],
			
 
				+            2: [" 3.C.1 -Emissions from biomass burning"],
			
 
				+        },
			
 
				+        "page_defs": {
			
 
				+            "176": {
			
 
				+                "area": ["76,501,763,83"],
			
 
				+                "cols": ["265,320,360,396,433,471,503,564,624,658,694,741"],
			
 
				+            },
			
 
				+            "177": {
			
 
				+                "area": ["68,542,762,85"],
			
 
				+                "cols": ["280,329,374,410,449,482,546,604,637,679,725,751"],
			
 
				+            },
			
 
				+            "178": {
			
 
				+                "area": ["71,543,761,81"],
			
 
				+                "cols": ["265, 320,361,411,447,483,546,604,621,653,719,746"],
			
 
				+            },
			
 
				+            "179": {
			
 
				+                "area": ["70,542,761,346"],
			
 
				+                "cols": ["287,328,365,410,449,482,540,600,636,675,721,750"],
			
 
				+            },
			
 
				+        },
			
 
				+    },
			
 
				+    "2020": {
			
 
				+        "page_defs": {
			
 
				+            "180": {
			
 
				+                "area": ["70,436,769,86"],
			
 
				+                "cols": ["270, 322, 367, 405, 455, 488,550,607,637,669,727,753"],
			
 
				+            },
			
 
				+            "181": {
			
 
				+                "area": ["68,541,768,86"],
			
 
				+                "cols": ["288,343,379,405,460,490,559,600,650,683,729,755"],
			
 
				+            },
			
 
				+            "182": {
			
 
				+                "area": ["69, 539, 771, 86"],
			
 
				+                "cols": ["273,331,371,425,462,491,560,615,639,671,729,755"],
			
 
				+            },
			
 
				+            "183": {
			
 
				+                "area": ["69, 540, 769, 373"],
			
 
				+                "cols": ["288, 328,363,425,459,492,560,619,650,683,731,757"],
			
 
				+            },
			
 
				+        },
			
 
				+        "rows_to_fix": {
			
 
				+            -2: [
			
 
				+                "1.C.1 - Transport of CO2",
			
 
				+                "2.G.2 - SF6 and PFCs from Other Product Uses",
			
 
				+                "2.G.3 - N2O from Product Uses",
			
 
				+            ],
			
 
				+            2: [
			
 
				+                "2.B.8 - Petrochemical and Carbon Black",
			
 
				+                "2.D - Non-Energy Products from Fuels and",
			
 
				+                "2.F - Product Uses as Substitutes for Ozone",
			
 
				+                "3.C - Aggregate sources and non-CO2 emissions",
			
 
				+                "3.C.4 - Direct N2O Emissions from managed",
			
 
				+                "3.C.5 - Indirect N2O Emissions from managed",
			
 
				+                "3.C.6 - Indirect N2O Emissions from manure",
			
 
				+                "5.A - Indirect N2O emissions from the atmospheric",
			
 
				+                "1.A.3.d.i - International water-borne navigation",
			
 
				+                "1.A.3.a.i - International Aviation (International",
			
 
				+                "2.B.4 - Caprolactam. Glyoxal and Glyoxylic Acid",
			
 
				+            ],
			
 
				+        },
			
 
				+    },
			
 
				+}
			
 
				+
			
 
				+inv_conf_per_entity = {
			
 
				+    "CO": {
			
 
				+        "page_defs": {
			
 
				+            "39": {
			
 
				+                "area": ["53,646,550,588"],
			
 
				+                "cols": ["279,328,364,400,440,478,520"],
			
 
				+            },
			
 
				+        },
			
 
				+        "cat_codes_manual": {"Total National Emissions": "0"},
			
 
				+        "category_column": "Categories",
			
 
				+        "columns_to_drop": ["Categories"],
			
 
				+        "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
			
 
				+        "unit": "Gg",
			
 
				+    },
			
 
				+    "NOx": {
			
 
				+        "page_defs": {
			
 
				+            "38": {
			
 
				+                "area": ["53,120,538,93"],
			
 
				+                "cols": ["281,329,365,405,441,477,513"],
			
 
				+            },
			
 
				+            "39": {
			
 
				+                "area": ["51,772,539,740"],
			
 
				+                "cols": ["285,332,368,404,444,476,514"],
			
 
				+            },
			
 
				+        },
			
 
				+        "cat_codes_manual": {"Total National Emissions": "0"},
			
 
				+        "category_column": "Categories",
			
 
				+        "columns_to_drop": ["Categories"],
			
 
				+        "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
			
 
				+        "unit": "Gg",
			
 
				+    },
			
 
				+    "HFCs": {
			
 
				+        "page_defs": {
			
 
				+            "38": {
			
 
				+                "area": ["55,469,534,364"],
			
 
				+                "cols": ["251,302,367,427,486"],
			
 
				+            },
			
 
				+        },
			
 
				+        "cat_codes_manual": {"Total National Emissions (Gg CO2e)": "0"},
			
 
				+        "category_column": "Categories",
			
 
				+        "columns_to_drop": ["Share, %", "Categories"],
			
 
				+        "years": ["2007", "2010", "2015", "2020"],
			
 
				+        "unit": "Gg CO2e",
			
 
				+    },
			
 
				+    "N2O": {
			
 
				+        "page_defs": {
			
 
				+            "37": {
			
 
				+                "area": ["55,106,556,79"],
			
 
				+                "cols": ["170,258,305,347,394,440,476,512"],
			
 
				+            },
			
 
				+            "38": {
			
 
				+                "area": ["55,773,555,664"],
			
 
				+                "cols": ["215,264,306,353,395,439,476,513"],
			
 
				+            },
			
 
				+        },
			
 
				+        "rows_to_fix": {
			
 
				+            3: [
			
 
				+                "3 - Agriculture, Forestry, and Other",
			
 
				+                "3.C - Aggregate sources and non-",
			
 
				+                "4.D - Wastewater Treatment and",
			
 
				+            ]
			
 
				+        },
			
 
				+        "cat_codes_manual": {"Total National Emissions (Gg N2O)": "0"},
			
 
				+        "category_column": "Categories",
			
 
				+        "columns_to_drop": ["Share, %", "Categories"],
			
 
				+        "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
			
 
				+        "unit": "Gg",
			
 
				+    },
			
 
				+    "CH4": {
			
 
				+        "page_defs": {
			
 
				+            "37": {
			
 
				+                "area": ["55,423,552,216"],
			
 
				+                "cols": ["186,250,296,326,383,427,467,507"],
			
 
				+            },
			
 
				+        },
			
 
				+        "rows_to_fix": {
			
 
				+            3: [
			
 
				+                "1.A - Fuel Combustion",
			
 
				+                "1.B - Fugitive emissions from",
			
 
				+                "3 - Agriculture, Forestry, and",
			
 
				+                "3.C - Aggregate sources and",
			
 
				+                "4.D - Wastewater Treatment",
			
 
				+                "Total National Emissions (Gg",
			
 
				+            ]
			
 
				+        },
			
 
				+        "cat_codes_manual": {"Total National Emissions (Gg CH4)": "0"},
			
 
				+        "category_column": "Categories",
			
 
				+        "columns_to_drop": ["Share, %", "Categories"],
			
 
				+        "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
			
 
				+        "unit": "Gg",
			
 
				+    },
			
 
				+    "CO2": {
			
 
				+        "page_defs": {
			
 
				+            "36": {
			
 
				+                "area": ["53,147,556,79"],
			
 
				+                "cols": ["150,204,254,306,352,406,459,513"],
			
 
				+            },
			
 
				+            "37": {
			
 
				+                "area": ["51,772,561,515"],
			
 
				+                "cols": ["151,202,252,305,357,404,463,517"],
			
 
				+            },
			
 
				+        },
			
 
				+        "rows_to_fix": {
			
 
				+            2: [
			
 
				+                "Categories",
			
 
				+                "Emissions and",
			
 
				+            ],
			
 
				+            3: [
			
 
				+                "1.A - Fuel",
			
 
				+                "1.B - Fugitive",
			
 
				+                "2 - Industrial Processes",
			
 
				+                "3 - Agriculture,",
			
 
				+                "Total National",
			
 
				+                "Total National",
			
 
				+            ],
			
 
				+            5: ["2.D - Non-Energy"],
			
 
				+            -2: [
			
 
				+                "Categories ",
			
 
				+                "Emissions and Removals (Gg CO2)",
			
 
				+            ],
			
 
				+        },
			
 
				+        "rows_to_drop": [
			
 
				+            "Total National Emissions (Gg CO2)",
			
 
				+            "Total National Removals (Gg CO2)",
			
 
				+        ],
			
 
				+        "columns_to_drop": ["Share, %", " Categories "],
			
 
				+        "cat_codes_manual": {"Total National Emissions and Removals (Gg CO2)": "0"},
			
 
				+        "category_column": " Categories ",
			
 
				+        "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
			
 
				+        "unit": "Gg",
			
 
				+    },
			
 
				+}
			
 
				+
			
 
				+# primap2 format conversion
			
 
				+coords_cols = {
			
 
				+    "category": "category",
			
 
				+    "entity": "entity",
			
 
				+    "unit": "unit",
			
 
				+}
			
 
				+
			
 
				+coords_defaults = {
			
 
				+    "source": "MNG-GHG-Inventory",
			
 
				+    "provenance": "measured",
			
 
				+    "area": "MNG",
			
 
				+    "scenario": "BUR2",
			
 
				+}
			
 
				+
			
 
				+coords_terminologies = {
			
 
				+    "area": "ISO3",
			
 
				+    "category": "IPCC2006_PRIMAP",
			
 
				+    "scenario": "PRIMAP",
			
 
				+}
			
 
				+
			
 
				+gwp_to_use = "SARGWP100"
			
 
				+coords_value_mapping = {
			
 
				+    "unit": "PRIMAP1",
			
 
				+    "category": "PRIMAP1",
			
 
				+    "entity": {
			
 
				+        "HFCs": f"HFCS ({gwp_to_use})",
			
 
				+        "PFCs": f"PFCS ({gwp_to_use})",
			
 
				+        "SF6": f"SF6 ({gwp_to_use})",
			
 
				+        "other halogenated gases": f"other halogenated gases ({gwp_to_use})",
			
 
				+        "NMVOCs": "NMVOC",
			
 
				+    },
			
 
				+}
			
 
				+
			
 
				+filter_remove = {
			
 
				+    "f_memo": {"category": "MEMO"},
			
 
				+    "f_empty": {"category": ""},
			
 
				+    "f2": {
			
 
				+        "entity": ["Other halogenated gases without CO2 equivalent conversion factors"],
			
 
				+    },
			
 
				+}
			
 
				+
			
 
				+meta_data = {
			
 
				+    "references": "https://unfccc.int/documents/633382",
			
 
				+    "rights": "",  # unknown
			
 
				+    "contact": "daniel-busch@climate-resource.de",
			
 
				+    "title": "Mongolia. Biennial update report (BUR). BUR2",
			
 
				+    "comment": "Read fom pdf by Daniel Busch",
			
 
				+    "institution": "UNFCCC",
			
 
				+}
			
 
				+
			
 
				+country_processing_step1 = {
			
 
				+    "tolerance": 0.01,
			
 
				+    "aggregate_cats": {
			
 
				+        # TODO: Remove "M.3.C.AG". Just here to see previous aggregation setup.
			
 
				+        # "M.3.C.AG": {
			
 
				+        #     "sources": [
			
 
				+        #         "3.C.1",
			
 
				+        #         "3.C.2",
			
 
				+        #         "3.C.3",
			
 
				+        #         "3.C.4",
			
 
				+        #         "3.C.5",
			
 
				+        #         "3.C.6",
			
 
				+        #         "3.C.7",
			
 
				+        #         "3.C.8",
			
 
				+        #     ],
			
 
				+        #     "name": "Aggregate sources and non-CO2 emissions sources on land "
			
 
				+        #     "(Agriculture)",
			
 
				+        # },
			
 
				+        "M.3.D.AG": {"sources": ["3.D.2"], "name": "Other (Agriculture)"},
			
 
				+        # TODO: In this case 3.C should be equivalent to M.3.C.AG, but I'm not sure.
			
 
				+        "M.AG.ELV": {
			
 
				+            "sources": ["3.C", "M.3.D.AG"],
			
 
				+            "name": "Agriculture excluding livestock",
			
 
				+        },
			
 
				+        "M.AG": {"sources": ["3.A", "M.AG.ELV"], "name": "Agriculture"},
			
 
				+        "M.3.D.LU": {"sources": ["3.D.1"], "name": "Other (LULUCF)"},
			
 
				+        "M.LULUCF": {"sources": ["3.B", "M.3.D.LU"], "name": "LULUCF"},
			
 
				+        "M.0.EL": {
			
 
				+            "sources": ["1", "2", "M.AG", "4", "5"],
			
 
				+            "name": "National total emissions excluding LULUCF",
			
 
				+        },
			
 
				+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},  # consistency check
			
 
				+        "M.0.EL": {"sources": ["1", "2", "M.AG", "4"]},  # consistency check
			
 
				+        "0": {"sources": ["1", "2", "3", "4"]},  # consistency check
			
 
				+    },
			
 
				+    "basket_copy": {
			
 
				+        "GWPs_to_add": ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
			
 
				+        "entities": ["HFCS", "PFCS"],
			
 
				+        "source_GWP": gwp_to_use,
			
 
				+    },
			
 
				+}
			
 
				+
			
 
				+gas_baskets = {
			
 
				+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
			
 
				+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
			
 
				+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
			
 
				+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
			
 
				+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
			
 
				+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
			
 
				+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
			
 
				+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
			
 
				+}
			
--- a/UNFCCC_GHG_data/UNFCCC_reader/Mongolia/read_MNG_BUR2_from_pdf.py
+++ b/UNFCCC_GHG_data/UNFCCC_reader/Mongolia/read_MNG_BUR2_from_pdf.py
@@ -0,0 +1,335 @@
 
				+import camelot
			
 
				+import primap2 as pm2
			
 
				+import pandas as pd
			
 
				+
			
 
				+from UNFCCC_GHG_data.helper import (
			
 
				+    downloaded_data_path,
			
 
				+    extracted_data_path,
			
 
				+    fix_rows,
			
 
				+    process_data_for_country,
			
 
				+)
			
 
				+from config_MNG_BUR2 import (
			
 
				+    inv_conf,
			
 
				+    inv_conf_per_year,
			
 
				+    inv_conf_per_entity,
			
 
				+    coords_cols,
			
 
				+    coords_defaults,
			
 
				+    coords_terminologies,
			
 
				+    coords_value_mapping,
			
 
				+    filter_remove,
			
 
				+    meta_data,
			
 
				+    country_processing_step1,
			
 
				+    gas_baskets,
			
 
				+)
			
 
				+
			
 
				+# ###
			
 
				+# configuration
			
 
				+# ###
			
 
				+
			
 
				+input_folder = downloaded_data_path / "UNFCCC" / "Mongolia" / "BUR2"
			
 
				+output_folder = extracted_data_path / "UNFCCC" / "Mongolia"
			
 
				+
			
 
				+if not output_folder.exists():
			
 
				+    output_folder.mkdir()
			
 
				+
			
 
				+pdf_file = "20231112_NIR_MGL.pdf"
			
 
				+output_filename = "MNG_BUR2_2023_"
			
 
				+category_column = f"category ({coords_terminologies['category']})"
			
 
				+compression = dict(zlib=True, complevel=9)
			
 
				+
			
 
				+# ###
			
 
				+# 1. Read in main tables
			
 
				+# ###
			
 
				+
			
 
				+df_main = None
			
 
				+for year in inv_conf_per_year.keys():
			
 
				+    print("-" * 60)
			
 
				+    print(f"Reading year {year}.")
			
 
				+    print("-" * 60)
			
 
				+    df_year = None
			
 
				+    for page in inv_conf_per_year[year]["page_defs"].keys():
			
 
				+        print(f"Reading table from page {page}.")
			
 
				+        tables_inventory_original = camelot.read_pdf(
			
 
				+            str(input_folder / pdf_file),
			
 
				+            pages=page,
			
 
				+            table_areas=inv_conf_per_year[year]["page_defs"][page]["area"],
			
 
				+            columns=inv_conf_per_year[year]["page_defs"][page]["cols"],
			
 
				+            flavor="stream",
			
 
				+            split_text=True,
			
 
				+        )
			
 
				+        print("Reading complete.")
			
 
				+
			
 
				+        df_page = tables_inventory_original[0].df
			
 
				+
			
 
				+        if df_year is None:
			
 
				+            df_year = df_page
			
 
				+        else:
			
 
				+            df_year = pd.concat(
			
 
				+                [df_year, df_page],
			
 
				+                axis=0,
			
 
				+                join="outer",
			
 
				+            ).reset_index(drop=True)
			
 
				+
			
 
				+    print(f"Concatenating all tables for {year}.")
			
 
				+
			
 
				+    # fix content that spreads across multiple rows
			
 
				+    if "rows_to_fix" in inv_conf_per_year[year]:
			
 
				+        for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
			
 
				+            print(f"Merge content for {n_rows=}")
			
 
				+            df_year = fix_rows(
			
 
				+                df_year,
			
 
				+                rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
			
 
				+                col_to_use=0,
			
 
				+                n_rows=n_rows,
			
 
				+            )
			
 
				+
			
 
				+    df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
			
 
				+
			
 
				+    skip_rows = 11
			
 
				+    df_year = pd.concat(
			
 
				+        [df_header, df_year[skip_rows:]], axis=0, join="outer"
			
 
				+    ).reset_index(drop=True)
			
 
				+
			
 
				+    df_year = pm2.pm2io.nir_add_unit_information(
			
 
				+        df_year,
			
 
				+        unit_row=inv_conf["unit_row"],
			
 
				+        entity_row=inv_conf["entity_row"],
			
 
				+        regexp_entity=".*",
			
 
				+        regexp_unit=".*",
			
 
				+        default_unit="Gg",
			
 
				+    )
			
 
				+
			
 
				+    print("Added unit information.")
			
 
				+
			
 
				+    # set index
			
 
				+    df_year = df_year.set_index(inv_conf["index_cols"])
			
 
				+
			
 
				+    # convert to long format
			
 
				+    df_year_long = pm2.pm2io.nir_convert_df_to_long(
			
 
				+        df_year, year, inv_conf["header_long"]
			
 
				+    )
			
 
				+
			
 
				+    # extract from tuple
			
 
				+    df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
			
 
				+
			
 
				+    # prep for conversion to PM2 IF and native format
			
 
				+    # make a copy of the categories row
			
 
				+    df_year_long["category"] = df_year_long["orig_cat_name"]
			
 
				+
			
 
				+    # replace cat names by codes in col "category"
			
 
				+    # first the manual replacements
			
 
				+
			
 
				+    df_year_long["category"] = df_year_long["category"].replace(
			
 
				+        inv_conf["cat_codes_manual"]
			
 
				+    )
			
 
				+
			
 
				+    df_year_long["category"] = df_year_long["category"].str.replace(".", "")
			
 
				+
			
 
				+    # then the regex replacements
			
 
				+    def repl(m):
			
 
				+        return m.group("code")
			
 
				+
			
 
				+    df_year_long["category"] = df_year_long["category"].str.replace(
			
 
				+        inv_conf["cat_code_regexp"], repl, regex=True
			
 
				+    )
			
 
				+
			
 
				+    df_year_long = df_year_long.reset_index(drop=True)
			
 
				+
			
 
				+    df_year_long["data"] = df_year_long["data"].str.replace(",", "")
			
 
				+
			
 
				+    # make sure all col headers are str
			
 
				+    df_year_long.columns = df_year_long.columns.map(str)
			
 
				+
			
 
				+    df_year_long = df_year_long.drop(columns=["orig_cat_name"])
			
 
				+
			
 
				+    if df_main is None:
			
 
				+        df_main = df_year_long
			
 
				+    else:
			
 
				+        df_main = pd.concat(
			
 
				+            [df_main, df_year_long],
			
 
				+            axis=0,
			
 
				+            join="outer",
			
 
				+        ).reset_index(drop=True)
			
 
				+
			
 
				+### convert to interchange format ###
			
 
				+print("Converting to interchange format.")
			
 
				+df_main_IF = pm2.pm2io.convert_long_dataframe_if(
			
 
				+    df_main,
			
 
				+    coords_cols=coords_cols,
			
 
				+    coords_defaults=coords_defaults,
			
 
				+    coords_terminologies=coords_terminologies,
			
 
				+    coords_value_mapping=coords_value_mapping,
			
 
				+    filter_remove=filter_remove,
			
 
				+    meta_data=meta_data,
			
 
				+    convert_str=True,
			
 
				+    time_format="%Y",
			
 
				+)
			
 
				+
			
 
				+### convert to primap2 format ###
			
 
				+print("Converting to primap2 format.")
			
 
				+data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_IF)
			
 
				+
			
 
				+# ###
			
 
				+# 2. Read in trend tables
			
 
				+# ###
			
 
				+
			
 
				+df_trend = None
			
 
				+for entity in inv_conf_per_entity.keys():
			
 
				+    print("-" * 60)
			
 
				+    print(f"Reading entity {entity}.")
			
 
				+
			
 
				+    df_entity = None
			
 
				+
			
 
				+    for page in inv_conf_per_entity[entity]["page_defs"].keys():
			
 
				+        print(f"Reading page {page}.")
			
 
				+
			
 
				+        tables_inventory_original = camelot.read_pdf(
			
 
				+            str(input_folder / pdf_file),
			
 
				+            pages=page,
			
 
				+            table_areas=inv_conf_per_entity[entity]["page_defs"][page]["area"],
			
 
				+            columns=inv_conf_per_entity[entity]["page_defs"][page]["cols"],
			
 
				+            flavor="stream",
			
 
				+            split_text=True,
			
 
				+        )
			
 
				+        df_page = tables_inventory_original[0].df
			
 
				+
			
 
				+        if df_entity is None:
			
 
				+            df_entity = df_page
			
 
				+        else:
			
 
				+            df_entity = pd.concat(
			
 
				+                [df_entity, df_page],
			
 
				+                axis=0,
			
 
				+                join="outer",
			
 
				+            ).reset_index(drop=True)
			
 
				+        print(f"adding table from page {page}.")
			
 
				+
			
 
				+    if "rows_to_fix" in inv_conf_per_entity[entity]:
			
 
				+        for n_rows in inv_conf_per_entity[entity]["rows_to_fix"].keys():
			
 
				+            print(f"Merge content for {n_rows=}")
			
 
				+            df_entity = fix_rows(
			
 
				+                df_entity,
			
 
				+                rows_to_fix=inv_conf_per_entity[entity]["rows_to_fix"][n_rows],
			
 
				+                col_to_use=0,
			
 
				+                n_rows=n_rows,
			
 
				+            )
			
 
				+
			
 
				+    df_entity.columns = df_entity.iloc[0, :]
			
 
				+    df_entity = df_entity[1:]
			
 
				+
			
 
				+    # unit is always Gg
			
 
				+    df_entity.loc[:, "unit"] = inv_conf_per_entity[entity]["unit"]
			
 
				+
			
 
				+    # only one entity per table
			
 
				+    df_entity.loc[:, "entity"] = entity
			
 
				+
			
 
				+    # TODO: Fix pandas "set value on slice of copy" warning
			
 
				+    df_entity.loc[:, "category"] = df_entity.loc[
			
 
				+        :, inv_conf_per_entity[entity]["category_column"]
			
 
				+    ]
			
 
				+
			
 
				+    if "rows_to_drop" in inv_conf_per_entity[entity]:
			
 
				+        for row in inv_conf_per_entity[entity]["rows_to_drop"]:
			
 
				+            row_to_delete = df_entity.index[df_entity["category"] == row][0]
			
 
				+            df_entity = df_entity.drop(index=row_to_delete)
			
 
				+
			
 
				+    df_entity.loc[:, "category"] = df_entity.loc[:, "category"].replace(
			
 
				+        inv_conf_per_entity[entity]["cat_codes_manual"]
			
 
				+    )
			
 
				+
			
 
				+    def repl(m):
			
 
				+        return m.group("code")
			
 
				+
			
 
				+    df_entity.loc[:, "category"] = df_entity["category"].str.replace(
			
 
				+        inv_conf["cat_code_regexp"], repl, regex=True
			
 
				+    )
			
 
				+
			
 
				+    df_entity = df_entity.drop(columns=inv_conf_per_entity[entity]["columns_to_drop"])
			
 
				+
			
 
				+    for year in inv_conf_per_entity[entity]["years"]:
			
 
				+        df_entity.loc[:, year] = df_entity[year].str.replace(",", "")
			
 
				+
			
 
				+    if df_trend is None:
			
 
				+        df_trend = df_entity
			
 
				+    else:
			
 
				+        df_trend = pd.concat(
			
 
				+            [df_trend, df_entity],
			
 
				+            axis=0,
			
 
				+            join="outer",
			
 
				+        ).reset_index(drop=True)
			
 
				+
			
 
				+### convert to interchange format ###
			
 
				+df_trend_IF = pm2.pm2io.convert_wide_dataframe_if(
			
 
				+    data_wide=df_trend,
			
 
				+    coords_cols=coords_cols,
			
 
				+    coords_defaults=coords_defaults,
			
 
				+    coords_terminologies=coords_terminologies,
			
 
				+    coords_value_mapping=coords_value_mapping,
			
 
				+    # filter_remove=filter_remove,
			
 
				+    meta_data=meta_data,
			
 
				+    convert_str=True,
			
 
				+    time_format="%Y",
			
 
				+)
			
 
				+
			
 
				+### convert to primap2 format ###
			
 
				+print("Converting to primap2 format.")
			
 
				+data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_IF)
			
 
				+
			
 
				+# ###
			
 
				+# Merge main and trend tables.
			
 
				+# ###
			
 
				+
			
 
				+print("Merging main and trend table.")
			
 
				+data_pm2 = data_main_pm2.pr.merge(data_trend_pm2, tolerance=1)
			
 
				+
			
 
				+# ###
			
 
				+# Save raw data to IF and native format.
			
 
				+# ###
			
 
				+
			
 
				+data_if = data_pm2.pr.to_interchange_format()
			
 
				+
			
 
				+pm2.pm2io.write_interchange_format(
			
 
				+    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
			
 
				+    data_if,
			
 
				+)
			
 
				+
			
 
				+encoding = {var: compression for var in data_pm2.data_vars}
			
 
				+data_pm2.pr.to_netcdf(
			
 
				+    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
			
 
				+    encoding=encoding,
			
 
				+)
			
 
				+
			
 
				+# ###
			
 
				+# Processing
			
 
				+# ###
			
 
				+
			
 
				+data_proc_pm2 = process_data_for_country(
			
 
				+    data_country=data_pm2,
			
 
				+    entities_to_ignore=[],
			
 
				+    gas_baskets=gas_baskets,
			
 
				+    filter_dims=None,
			
 
				+    cat_terminology_out=None,
			
 
				+    category_conversion=None,
			
 
				+    sectors_out=None,
			
 
				+    processing_info_country=country_processing_step1,
			
 
				+)
			
 
				+
			
 
				+# ###
			
 
				+# save processed data to IF and native format
			
 
				+# ###
			
 
				+
			
 
				+terminology_proc = coords_terminologies["category"]
			
 
				+
			
 
				+data_proc_if = data_proc_pm2.pr.to_interchange_format()
			
 
				+
			
 
				+if not output_folder.exists():
			
 
				+    output_folder.mkdir()
			
 
				+pm2.pm2io.write_interchange_format(
			
 
				+    output_folder / (output_filename + terminology_proc), data_proc_if
			
 
				+)
			
 
				+
			
 
				+encoding = {var: compression for var in data_proc_pm2.data_vars}
			
 
				+data_proc_pm2.pr.to_netcdf(
			
 
				+    output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
			
 
				+)
			
 
				+
			
 
				+print("Saved processed data.")