Browse Source

convert to interchange format and primap 2 format

Daniel Busch 11 months ago
parent
commit
551d6ab845

+ 159 - 73
UNFCCC_GHG_data/UNFCCC_reader/Burundi/config_BDI_BUR1.py

@@ -4,6 +4,25 @@ coords_terminologies = {
     "scenario": "PRIMAP",
 }
 
+# TODO: This is duplicate information
+years_to_read = [
+    "2005",
+    "2006",
+    "2007",
+    "2008",
+    "2009",
+    "2010",
+    "2011",
+    "2012",
+    "2013",
+    "2014",
+    "2015",
+    "2016",
+    "2017",
+    "2018",
+    "2019",
+]
+
 # define config dict
 inv_conf = {
     "entity_row": 0,
@@ -11,84 +30,151 @@ inv_conf = {
     "index_cols": "Greenhouse gas source and sink categories",
     "header_long": ["orig_cat_name", "entity", "unit", "time", "data"],
     "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
+    "header": [
+        "Greenhouse gas source and sink categories",
+        "CO2",
+        "CH4",
+        "N2O",
+        "HFCs",
+        "PFCs",
+        "SF6",
+        "Other halogenated gases with CO2 equivalent conversion factors",
+        "Other halogenated gases without CO2 equivalent conversion factors",
+        "NOx",
+        "CO",
+        "NMVOCs",
+        "SO2",
+    ],
+    "unit": [
+        "-",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+        "GgCO2eq",
+        "GgCO2eq",
+        "GgCO2eq",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+    ],
+    "cat_codes_manual": {
+        "Memo Items (5)": "MEMO",
+        "International Bunkers": "M.BK",
+        "1.A.3.a.i - International Aviation (International Bunkers) (1)": "M.BK.A",
+        "1.A.3.d.i - International water-borne navigation (International bunkers) (1)": "M.BK.M",
+        # TODO: handle with regex
+        "1.A.3.d.i - International water-borne navigation (International                      bunkers) (1)": "M.BK.M",
+        "1.A.3.d.i - International water-borne navigation (International bunkers)                      (1)": "M.BK.M",
+        "1.A.5.c - Multilateral Operations (1)(2)": "M.MULTIOP",
+        "Total National Emissions and Removals": "0",
+    },
     "2005": {
         "pages_to_read": ["197", "198", "199", "200"],
-        "header": [
-            "Greenhouse gas source and sink categories",
-            "CO2",
-            "CH4",
-            "N2O",
-            "HFCs",
-            "PFCs",
-            "SF6",
-            "Other halogenated gases with CO2 equivalent conversion factors",
-            "Other halogenated gases without CO2 equivalent conversion factors" "NOx",
-            "CO",
-            "NMVOCs",
-            "SO2",
-        ],
-        "unit": [
-            "-",
-            "Gg",
-            "Gg",
-            "Gg",
-            "GgCO2eq",
-            "GgCO2eq",
-            "GgCO2eq",
-            "GgCO2eq",
-            "Gg",
-            "Gg",
-            "Gg",
-            "Gg",
-            "Gg",
-        ],
-        "cat_codes_manual": {
-            "Memo Items (5)": "MEMO",
-            "International Bunkers": "M.BK",
-            "1.A.3.a.i - International Aviation (International Bunkers) (1)": "M.BK.A",
-            "1.A.3.d.i - International water-borne navigation (International bunkers) (1)": "M.BK.M",
-            "1.A.5.c - Multilateral Operations (1)(2)": "M.MULTIOP",
-            "Total National Emissions and Removals": "0",
-        },
     },
     "2006": {
         "pages_to_read": ["201", "202", "203", "204"],
-        "header": [
-            "Greenhouse gas source and sink categories",
-            "CO2",
-            "CH4",
-            "N2O",
-            "HFCs",
-            "PFCs",
-            "SF6",
-            "Other halogenated gases with CO2 equivalent conversion factors",
-            "Other halogenated gases without CO2 equivalent conversion factors" "NOx",
-            "CO",
-            "NMVOCs",
-            "SO2",
-        ],
-        "unit": [
-            "-",
-            "Gg",
-            "Gg",
-            "Gg",
-            "GgCO2eq",
-            "GgCO2eq",
-            "GgCO2eq",
-            "GgCO2eq",
-            "Gg",
-            "Gg",
-            "Gg",
-            "Gg",
-            "Gg",
+    },
+    "2007": {
+        "pages_to_read": ["205", "206", "207", "208"],
+    },
+    "2008": {
+        "pages_to_read": ["209", "210", "211", "212"],
+    },
+    "2009": {
+        "pages_to_read": ["213", "214", "215", "216"],
+    },
+    "2010": {
+        "pages_to_read": ["221", "222", "223", "224"],
+    },
+    "2011": {
+        "pages_to_read": ["225", "226", "227", "228"],
+    },
+    "2012": {
+        "pages_to_read": ["229", "230", "231", "232"],
+    },
+    "2013": {
+        "pages_to_read": ["233", "234", "235", "236"],
+    },
+    "2014": {
+        "pages_to_read": ["237", "238", "239", "240"],
+    },
+    "2015": {
+        "pages_to_read": ["241", "242", "243", "244"],
+        "fix_values": [
+            (2, 10, "21,529"),
+            (1, 12, "NMVOCs"),
+            (2, 12, "0"),
         ],
-        "cat_codes_manual": {
-            "Memo Items (5)": "MEMO",
-            "International Bunkers": "M.BK",
-            "1.A.3.a.i - International Aviation (International Bunkers) (1)": "M.BK.A",
-            "1.A.3.d.i - International water-borne navigation (International bunkers) (1)": "M.BK.M",
-            "1.A.5.c - Multilateral Operations (1)(2)": "M.MULTIOP",
-            "Total National Emissions and Removals": "0",
-        },
+        "delete_columns": [11],
+    },
+    "2016": {
+        "pages_to_read": ["245", "246", "247", "248"],
+    },
+    "2017": {
+        "pages_to_read": ["249", "250", "251", "252"],
+    },
+    "2018": {
+        "pages_to_read": ["253", "254", "255", "256"],
+    },
+    "2019": {
+        "pages_to_read": ["257", "258", "259", "260"],
+    },
+}
+
+# primap2 format conversion
+coords_cols = {
+    "category": "category",
+    "entity": "entity",
+    "unit": "unit",
+}
+
+coords_defaults = {
+    "source": "BDI-GHG-Inventory",
+    "provenance": "measured",
+    "area": "BDI",
+    "scenario": "BUR1",
+}
+
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+
+gwp_to_use = "AR4GWP100"
+coords_value_mapping = {
+    "unit": "PRIMAP1",
+    "category": "PRIMAP1",
+    "entity": {
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "SF6": f"SF6 ({gwp_to_use})",
+        "NMVOCs": "NMVOC",
+        # "Other halogenated gases with CO2 equivalent conversion factors" : "PLACEHOLDER halo gases co2eq",
+        # "Other halogenated gases without CO2 equivalent conversion factors" : "PLACEHOLDER halo gases"
     },
 }
+
+filter_remove = {
+    "f_memo": {"category": "MEMO"},
+    "f_empty": {"category": ""},
+    "f1": {
+        "entity": ["Other halogenated gases with CO2 equivalent conversion factors"],
+    },
+    "f2": {
+        "entity": ["Other halogenated gases without CO2 equivalent conversion factors"],
+    },
+}
+
+meta_data = {
+    "references": "https://unfccc.int/documents/611668",
+    "rights": "",  # unknown
+    "contact": "daniel-busch@climate-resource.de",
+    "title": "Burundi. Biennial update report (BUR). BUR1",
+    "comment": "Read fom pdf by Daniel Busch",
+    "institution": "UNFCCC",
+}

+ 50 - 6
UNFCCC_GHG_data/UNFCCC_reader/Burundi/read_BDI_BUR1_from_pdf.py

@@ -10,7 +10,16 @@ import pandas as pd
 
 from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 
-from config_BDI_BUR1 import coords_terminologies, inv_conf
+from config_BDI_BUR1 import (
+    inv_conf,
+    meta_data,
+    filter_remove,
+    coords_value_mapping,
+    coords_terminologies,
+    coords_defaults,
+    coords_cols,
+    years_to_read,
+)
 
 # ###
 # configuration
@@ -32,13 +41,15 @@ compression = dict(zlib=True, complevel=9)
 # ###
 
 # read the inventory tables for every year listed in years_to_read
-year = "2005"
-years_to_read = ["2005", "2006"]
+
 df_all = None
 for year in years_to_read:
+    print("-" * 60)
+    print(f"Reading year {year}.")
+    print("-" * 60)
     df_year = None
     for page in inv_conf[year]["pages_to_read"]:
-        print("-" * 45)
+        print("-" * 20)
         print(f"Reading table from page {page}.")
 
         tables_inventory_original = camelot.read_pdf(
@@ -67,7 +78,19 @@ for year in years_to_read:
     for column in df_year.columns:
         df_year[column] = df_year[column].str.replace("\n", "")
 
-    df_header = pd.DataFrame([inv_conf[year]["header"], inv_conf[year]["unit"]])
+    # fix broken values in cells
+    if "fix_values" in inv_conf[year].keys():
+        for index, column, value in inv_conf[year]["fix_values"]:
+            df_year.at[index, column] = value
+
+    # delete extra columns
+    if "delete_columns" in inv_conf[year].keys():
+        for column in inv_conf[year]["delete_columns"]:
+            print(f"Delete columns {column} for year {year}")
+            df_year = df_year.drop(columns=column)
+        df_year.columns = range(df_year.columns.size)
+
+    df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
 
     df_year = pd.concat([df_header, df_year[2:]], axis=0, join="outer").reset_index(
         drop=True
@@ -102,8 +125,9 @@ for year in years_to_read:
     # replace cat names by codes in col "category"
     # first the manual replacements
     df_year_long["category"] = df_year_long["category"].str.replace("\n", "")
+
     df_year_long["category"] = df_year_long["category"].replace(
-        inv_conf["2005"]["cat_codes_manual"]
+        inv_conf["cat_codes_manual"]
     )
 
     # "." must be removed literally; regex=False keeps it from ever being
     # interpreted as the regex wildcard, independent of the pandas version
     df_year_long["category"] = df_year_long["category"].str.replace(
         ".", "", regex=False
     )
@@ -125,6 +149,8 @@ for year in years_to_read:
     df_year_long.columns = df_year_long.columns.map(str)
     df_year_long = df_year_long.drop(columns=["orig_cat_name"])
 
+    assert "1A3di" not in df_year_long["category"].unique()
+
     if df_all is None:
         df_all = df_year_long
     else:
@@ -133,3 +159,21 @@ for year in years_to_read:
             axis=0,
             join="outer",
         ).reset_index(drop=True)
+
+
+print("Converting to interchange format.")
+df_all_IF = pm2.pm2io.convert_long_dataframe_if(
+    df_all,
+    coords_cols=coords_cols,
+    coords_defaults=coords_defaults,
+    coords_terminologies=coords_terminologies,
+    coords_value_mapping=coords_value_mapping,
+    filter_remove=filter_remove,
+    meta_data=meta_data,
+    convert_str=True,
+    time_format="%Y",
+)
+
+print("Converting to primap2 format.")
+### convert to primap2 format ###
+data_pm2_all = pm2.pm2io.from_interchange_format(df_all_IF)