
first trend table

Daniel Busch 6 months ago
parent
commit
bb46722613

+ 95 - 1
src/unfccc_ghg_data/unfccc_reader/Saint_Kitts_and_Nevis/config_kna_bur1.py

@@ -2,7 +2,7 @@
 Configuration file to read Saint Kitts and Nevis' BUR 1.
 
 Tables to read:
-- The sector tables in the Annex from page 149
+- The sector tables in the Annex from page 149 - done
 - trend tables page 111-113
 - page 117
 - page 118
@@ -69,6 +69,100 @@ conf_general = {
     "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
 }
 
+conf_trend = {
+    "overview": {
+        "fix_single_value": {
+            "cat": "MBIO",
+            "year": "2018",
+            "new_value": "0.17",
+        },
+        "entity": f"KYOTOGHG ({gwp_to_use})",
+        "unit": "GgCO2eq",
+        "replace_data_entries": {"NO,NE": "NO"},
+        "cat_codes_manual": {
+            "Total CO2 Eq. Emissions without  LULUCF": "M.0.EL",
+            "Total CO2 Eq. Emissions with  LULUCF": "M.LULUCF",
+            # "1. Energy": "1. Energy",
+            "A. Fuel Combustion": "1.A",
+            "1.  Energy Industries": "1.A.1",
+            "2.  Man. Ind. & Constr.": "1.A.2",
+            "3.  Transport": "1.A.3",
+            "4.  Other Sectors": "1.A.4",
+            "5.  Other": "1.A.5",
+            "B. Fugitive Emissions from Fuels": "1.B",
+            "1.  Solid Fuels": "1.B.1",
+            "2.  Oil and Natural Gas and other…": "1.B.2",
+            # "2.  Industrial Processes": "2.  Industrial Processes",
+            "A.  Mineral Industry": "2.A",
+            "B.  Chemical Industry": "2.B",
+            "C.  Metal Industry": "2.C",
+            "D.  Non-energy products": "2.D",
+            "E.  Electronics industry": "2.E",
+            "F.  Product uses as ODS substitutes": "2.F",
+            "G.  Other product manufacture and": "2.G",
+            "use  H.  Other": "2.H",
+            # "3.  Agriculture": "3.  Agriculture",
+            "A.  Enteric Fermentation": "3.A.1",
+            "B.  Manure Management": "3.A.2",
+            "C.  Rice Cultivation": "3.C.7",
+            "D.  Agricultural Soils": "3.C.4",  # TODO confirm!
+            "E.  Prescribed Burning of Savannahs": "3.C.1.d",  # TODO confirm!
+            "F.  Field Burning of Agricultural": "3.C.1.b",  # TODO confirm!
+            "Residues  G.  Liming": "3.C.2",
+            "H.  Urea applications": "3.C.3",
+            "I.  Other carbon-containing": "3.D",  # TODO confirm!
+            "fertilisers  4. Land Use, Land-Use Change and  Forestry": "3.B",
+            "A. Forest Land": "3.B.1",
+            "B. Cropland": "3.B.2",
+            "C. Grassland": "3.B.3",
+            "D. Wetlands": "3.B.4",
+            "E. Settlements": "3.B.5",
+            "F. Other Land": "3.B.6",
+            "G. Harvested wood products": "3.D.1",
+            "H. Other": "3.D.2",
+            "5. Waste": "4",
+            "A.  Solid Waste Disposal": "4.A",
+            "B.  Biological treatment of solid": "4.B",
+            "waste  C. Incineration and open burning of": "4.C",
+            "D. Waste water treatment and": "4.D",
+            "discharge  E.  Other": "4.E",
+            "6.  Other": "5",
+            "CO2 Emissions from Biomass": "M.BIO",
+        },
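+        # these two columns are appended via extra_columns below and then
+        # dropped again, so only the yearly values remain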
+        "drop_cols": [
+            "change to BY",
+            "change to PY",
+        ],
+        "header": ["orig_category"],
+        "years": [
+            "2008",
+            "2009",
+            "2010",
+            "2011",
+            "2012",
+            "2013",
+            "2014",
+            "2015",
+            "2016",
+            "2017",
+            "2018",
+        ],
+        "extra_columns": [
+            "change to BY",
+            "change to PY",
+        ],
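+        # cells in this row hold two space-separated numbers (invisible
+        # numbers on page 112); keep the one at index keep_value_no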
+        "split_values": {
+            "cat": "3B2",
+            "keep_value_no": 1,
+        },
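+        # PDF pages to read and the number of rows to skip at the top of each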
+        "page_defs": {
+            "111": {"skip_rows_start": 1},
+            "112": {"skip_rows_start": 1},
+            "113": {"skip_rows_start": 1},
+        },
+    }
+}
+
 conf = {
     "energy": {
         # TODO: List of entities are always keys of unit mapping dict

+ 166 - 0
src/unfccc_ghg_data/unfccc_reader/Saint_Kitts_and_Nevis/read_KNA_BUR1_from_pdf.py

@@ -9,6 +9,7 @@ from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 from unfccc_ghg_data.unfccc_reader.Saint_Kitts_and_Nevis.config_kna_bur1 import (
     conf,
     conf_general,
+    conf_trend,
     coords_cols,
     coords_defaults,
     coords_terminologies,
@@ -34,6 +35,171 @@ if __name__ == "__main__":
     def repl(m):  # noqa: D103
         return m.group("code")
 
+    # ###
+    # 2. Read trend tables
+    # ###
+
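+    # the trend tables span pages 111-113 (see page_defs); each page is read
+    # separately and the pieces are concatenated into one dataframe per table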
+    df_trend = None
+    for table in reversed(conf_trend.keys()):
+        print("-" * 45)
+        print(f"Reading {table} trend table.")
+        df_table = None
+        for page in conf_trend[table]["page_defs"].keys():
+            print(f"Page {page}")
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                flavor="lattice",
+                split_text=True,
+            )
+
+            df_page = tables_inventory_original[0].df
+
+            skip_rows_start = conf_trend[table]["page_defs"][page]["skip_rows_start"]
+            if skip_rows_start != 0:
+                df_page = df_page[skip_rows_start:]
+
+            if df_table is None:
+                # Reset index to avoid pandas' SettingWithCopyWarning
+                df_table = df_page.reset_index(drop=True)
+            else:
+                df_table = pd.concat(
+                    [
+                        df_table,
+                        df_page,
+                    ],
+                    axis=0,
+                    join="outer",
+                ).reset_index(drop=True)
+
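+        # assign column names: the category column, one column per year,
+        # and the two change columns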
+        df_table.columns = (
+            conf_trend[table]["header"]
+            + conf_trend[table]["years"]
+            + conf_trend[table]["extra_columns"]
+        )
+
+        # drop columns if needed
+        if "drop_cols" in conf_trend[table].keys():
+            df_table = df_table.drop(columns=conf_trend[table]["drop_cols"])
+
+        # category codes from category names
+        df_table["category"] = df_table["orig_category"]
+        # Remove line break characters
+        df_table["category"] = df_table["category"].str.replace("\n", " ")
+        # first the manual replacements
+        df_table["category"] = df_table["category"].replace(
+            conf_trend[table]["cat_codes_manual"]
+        )
+        # remove dots from category codes
+        df_table["category"] = df_table["category"].str.replace(".", "")
+        # then the regex replacements
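+        # e.g. after dot removal, "1 Energy" is reduced to the bare
+        # code "1" by the named <code> group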
+        df_table["category"] = df_table["category"].str.replace(
+            conf_general["cat_code_regexp"], repl, regex=True
+        )
+
+        df_table = df_table.drop(columns="orig_category")
+
+        # clean values
+        for year in conf_trend[table]["years"]:
+            df_table[year] = df_table[year].replace(
+                conf_trend[table]["replace_data_entries"]
+            )
+            df_table[year] = df_table[year].str.replace("\n", "")
+            df_table[year] = df_table[year].str.replace(",", ".")
+            # invisible numbers in trend table on page 112
+            if "split_values" in conf_trend[table].keys():
+                cat = conf_trend[table]["split_values"]["cat"]
+                keep_value_no = conf_trend[table]["split_values"]["keep_value_no"]
+                new_value = (
+                    df_table.loc[df_table["category"] == cat, year]
+                    .item()
+                    .split(" ")[keep_value_no]
+                )
+                df_table.loc[df_table["category"] == cat, year] = new_value
+
+        if "fix_single_value" in conf_trend[table].keys():
+            cat = conf_trend[table]["fix_single_value"]["cat"]
+            year = conf_trend[table]["fix_single_value"]["year"]
+            new_value = conf_trend[table]["fix_single_value"]["new_value"]
+            df_table.loc[df_table["category"] == cat, year] = new_value
+
+        df_table["unit"] = conf_trend[table]["unit"]
+        df_table["entity"] = conf_trend[table]["entity"]
+
+        # stack the tables vertically
+        if df_trend is None:
+            df_trend = df_table.reset_index(drop=True)
+        else:
+            df_trend = pd.concat(
+                [
+                    df_trend,
+                    df_table,
+                ],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    #     # fill empty strings with NaN and the forward fill category names
+    #     df_page["category"] = df_page["category"].replace("", np.nan).ffill()
+    #
+    #     # remove /n from category names
+    #     df_page["category"] = df_page["category"].str.replace("\n", "")
+    #     # manual replacement of categories
+    #     df_page["category"] = df_page["category"].replace(
+    #         inv_conf_per_sector[sector]["cat_codes_manual"]
+    #     )
+    #
+    #     # remove all thousand separator commas
+    #     for year in trend_years :
+    #         df_page[year] = df_page[year].str.replace(",", ".")
+    #
+    #     # add unit
+    #     df_page["unit"] = inv_conf_per_sector[sector]["unit"]
+    #
+    #     # add entity if needed
+    #     if "entity" in inv_conf_per_sector[sector].keys() :
+    #         df_page["entity"] = inv_conf_per_sector[sector]["entity"]
+    #
+    #     if "unit_conversion" in inv_conf_per_sector[sector].keys() :
+    #         for year in trend_years :
+    #             index = inv_conf_per_sector[sector]["unit_conversion"]["index"]
+    #             conv_factor = inv_conf_per_sector[sector]["unit_conversion"][
+    #                 "conversion_factor"
+    #             ]
+    #             df_page.loc[index, year] = str(
+    #                 conv_factor * float(df_page.loc[index, year])
+    #             )
+    #
+    #     # stack the tables vertically
+    #     if df_trend is None :
+    #         df_trend = df_page
+    #     else :
+    #         df_trend = pd.concat(
+    #             [
+    #                 df_trend,
+    #                 df_page,
+    #             ],
+    #             axis=0,
+    #             join="outer",
+    #         ).reset_index(drop=True)
+    #
+    df_trend_if = pm2.pm2io.convert_wide_dataframe_if(
+        df_trend,
+        coords_cols=coords_cols,
+        # add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+    )
+    #
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_if)
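+    # data_trend_pm2 now holds the trend tables as a primap2 dataset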
+
     # ###
     # 1. Read in main tables
     # ###