Просмотр исходного кода

script that reads first two years

Daniel Busch 11 месяцев назад
Родитель
Commit
0bcd5dcfec

+ 94 - 0
UNFCCC_GHG_data/UNFCCC_reader/Burundi/config_BDI_BUR1.py

@@ -0,0 +1,94 @@
+# Maps each coordinate name to the name of the terminology used for it.
+coords_terminologies = dict(
+    area="ISO3",
+    category="IPCC2006_PRIMAP",
+    scenario="PRIMAP",
+)
+
+# Configuration for reading the inventory tables.
+#
+# The top-level entries apply to all years; each year key ("2005", "2006")
+# holds the settings for that year's table.
+inv_conf = {
+    # rows that hold the entity names and the units
+    "entity_row": 0,
+    "unit_row": 1,
+    # name of the column that holds the category names
+    "index_cols": "Greenhouse gas source and sink categories",
+    # column names of the long format
+    "header_long": ["orig_cat_name", "entity", "unit", "time", "data"],
+    # captures the leading category code of a category name in group "code"
+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
+    "2005": {
+        # PDF pages that make up the table
+        "pages_to_read": ["197", "198", "199", "200"],
+        # column names, one per table column (same length as "unit")
+        "header": [
+            "Greenhouse gas source and sink categories",
+            "CO2",
+            "CH4",
+            "N2O",
+            "HFCs",
+            "PFCs",
+            "SF6",
+            "Other halogenated gases with CO2 equivalent conversion factors",
+            "Other halogenated gases without CO2 equivalent conversion factors",
+            "NOx",
+            "CO",
+            "NMVOCs",
+            "SO2",
+        ],
+        # unit of each column, in the same order as "header"
+        "unit": [
+            "-",
+            "Gg",
+            "Gg",
+            "Gg",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "Gg",
+            "Gg",
+            "Gg",
+            "Gg",
+            "Gg",
+        ],
+        # category names that are mapped to a code by exact match
+        "cat_codes_manual": {
+            "Memo Items (5)": "MEMO",
+            "International Bunkers": "M.BK",
+            "1.A.3.a.i - International Aviation (International Bunkers) (1)": "M.BK.A",
+            "1.A.3.d.i - International water-borne navigation (International bunkers) (1)": "M.BK.M",
+            "1.A.5.c - Multilateral Operations (1)(2)": "M.MULTIOP",
+            "Total National Emissions and Removals": "0",
+        },
+    },
+    "2006": {
+        # PDF pages that make up the table
+        "pages_to_read": ["201", "202", "203", "204"],
+        # column names, one per table column (same length as "unit")
+        "header": [
+            "Greenhouse gas source and sink categories",
+            "CO2",
+            "CH4",
+            "N2O",
+            "HFCs",
+            "PFCs",
+            "SF6",
+            "Other halogenated gases with CO2 equivalent conversion factors",
+            "Other halogenated gases without CO2 equivalent conversion factors",
+            "NOx",
+            "CO",
+            "NMVOCs",
+            "SO2",
+        ],
+        # unit of each column, in the same order as "header"
+        "unit": [
+            "-",
+            "Gg",
+            "Gg",
+            "Gg",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "Gg",
+            "Gg",
+            "Gg",
+            "Gg",
+            "Gg",
+        ],
+        # category names that are mapped to a code by exact match
+        "cat_codes_manual": {
+            "Memo Items (5)": "MEMO",
+            "International Bunkers": "M.BK",
+            "1.A.3.a.i - International Aviation (International Bunkers) (1)": "M.BK.A",
+            "1.A.3.d.i - International water-borne navigation (International bunkers) (1)": "M.BK.M",
+            "1.A.5.c - Multilateral Operations (1)(2)": "M.MULTIOP",
+            "Total National Emissions and Removals": "0",
+        },
+    },
+}

+ 135 - 0
UNFCCC_GHG_data/UNFCCC_reader/Burundi/read_BDI_BUR1_from_pdf.py

@@ -0,0 +1,135 @@
+import os
+
+# NOTE(review): hard-coded, machine-specific path. The assignment overwrites
+# any UNFCCC_GHG_ROOT_PATH value already present in the environment. It is
+# placed before the UNFCCC_GHG_data import below, presumably because that
+# package reads the variable at import time — confirm.
+os.environ["UNFCCC_GHG_ROOT_PATH"] = (
+    "/Users/danielbusch/Documents/UNFCCC_non-AnnexI_data"
+)
+
+import camelot
+import primap2 as pm2
+import pandas as pd
+
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
+
+from config_BDI_BUR1 import coords_terminologies, inv_conf
+
+# ###
+# configuration
+# ###
+
+# folder that holds the PDF and folder for the extracted data
+input_folder = downloaded_data_path / "UNFCCC" / "Burundi" / "BUR1"
+output_folder = extracted_data_path / "UNFCCC" / "Burundi"
+
+# Create the output folder together with any missing parent folders.
+# exist_ok avoids the separate existence check (and the race it implies).
+output_folder.mkdir(parents=True, exist_ok=True)
+
+pdf_file = "Burundi_BUR_1_Report__Francais.pdf"
+output_filename = "BDI_BUR1_2023_"
+# name of the category column, including the category terminology
+category_column = f"category ({coords_terminologies['category']})"
+compression = dict(zlib=True, complevel=9)
+
+# ###
+# 1. Read in tables
+# ###
+
+# years for which a table is read; each needs an entry in inv_conf
+years_to_read = ["2005", "2006"]
+# collects the long-format data of all years (None until the first year is done)
+df_all = None
+def repl(m):
+    """Return the text captured by the named group ``code`` of match *m*."""
+    return m.group("code")
+
+
+# Read the table of every year, bring it into long format and append it to
+# df_all.
+for year in years_to_read:
+    # read all pages of this year's table and stack them
+    page_frames = []
+    for page in inv_conf[year]["pages_to_read"]:
+        print("-" * 45)
+        print(f"Reading table from page {page}.")
+
+        tables_inventory_original = camelot.read_pdf(
+            str(input_folder / pdf_file),
+            pages=page,
+            flavor="lattice",
+            split_text=True,
+        )
+
+        print("Reading complete.")
+
+        # only the first table found on the page is used
+        df_page = tables_inventory_original[0].df
+        page_frames.append(df_page)
+
+    df_year = pd.concat(
+        page_frames,
+        axis=0,
+        join="outer",
+    ).reset_index(drop=True)
+
+    # remove line breaks
+    for column in df_year.columns:
+        df_year[column] = df_year[column].str.replace("\n", "", regex=False)
+
+    # replace the first two rows by the configured header and unit rows
+    df_header = pd.DataFrame([inv_conf[year]["header"], inv_conf[year]["unit"]])
+
+    df_year = pd.concat([df_header, df_year[2:]], axis=0, join="outer").reset_index(
+        drop=True
+    )
+
+    df_year = pm2.pm2io.nir_add_unit_information(
+        df_year,
+        unit_row=inv_conf["unit_row"],
+        entity_row=inv_conf["entity_row"],
+        regexp_entity=".*",
+        regexp_unit=".*",
+        default_unit="Gg",
+    )
+
+    print("Added unit information.")
+
+    # set index
+    df_year = df_year.set_index(inv_conf["index_cols"])
+
+    # convert to long format
+    df_year_long = pm2.pm2io.nir_convert_df_to_long(
+        df_year, year, inv_conf["header_long"]
+    )
+
+    # extract from tuple
+    df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
+
+    # prep for conversion to PM2 IF and native format
+    # make a copy of the categories row
+    df_year_long["category"] = df_year_long["orig_cat_name"]
+
+    # replace cat names by codes in col "category"
+    # first the manual replacements (taken from the config of the current year)
+    df_year_long["category"] = df_year_long["category"].str.replace(
+        "\n", "", regex=False
+    )
+    df_year_long["category"] = df_year_long["category"].replace(
+        inv_conf[year]["cat_codes_manual"]
+    )
+
+    # Remove all dots. regex=False makes sure "." is a literal dot regardless
+    # of the pandas version.
+    # NOTE(review): this also strips the dots from the manual codes assigned
+    # just above (e.g. "M.BK" becomes "MBK") — confirm that this is intended.
+    df_year_long["category"] = df_year_long["category"].str.replace(
+        ".", "", regex=False
+    )
+
+    # then the regex replacements
+    df_year_long["category"] = df_year_long["category"].str.replace(
+        inv_conf["cat_code_regexp"], repl, regex=True
+    )
+
+    df_year_long = df_year_long.reset_index(drop=True)
+
+    # decimal comma -> decimal point
+    df_year_long["data"] = df_year_long["data"].str.replace(",", ".", regex=False)
+    df_year_long["data"] = df_year_long["data"].str.replace("NE1", "NE", regex=False)
+
+    # make sure all col headers are str
+    df_year_long.columns = df_year_long.columns.map(str)
+    df_year_long = df_year_long.drop(columns=["orig_cat_name"])
+
+    # append this year's data to the data of the previous years
+    if df_all is None:
+        df_all = df_year_long
+    else:
+        df_all = pd.concat(
+            [df_all, df_year_long],
+            axis=0,
+            join="outer",
+        ).reset_index(drop=True)