Browse Source

energy industries trends

Daniel Busch 6 tháng trước
mục cha
commit
9af5cfe477

+ 43 - 7
src/unfccc_ghg_data/unfccc_reader/Saint_Kitts_and_Nevis/config_kna_bur1.py

@@ -3,9 +3,9 @@ Configuration file to read Saint Kitts and Nevis' BUR 1.
 
 Tables to read:
 - The sector tables in the Annex from page 149 - done
-- trend tables page 111-113
-- page 117
-- page 118
+- trend tables page 111-113 - done
+- page 116 - done
+- page 118 - work in progress
 - page 119
 - page 121
 - page 124
@@ -70,6 +70,42 @@ conf_general = {
 }
 
 conf_trend = {
+    "energy_industries": {
+        "entity": f"KYOTOGHG ({gwp_to_use})",
+        "unit": "GgCO2eq",
+        "replace_data_entries": {"NO,NE": "NO"},
+        "cat_codes_manual": {
+            "a. Public electricity and heat production": "1.A.1.a",
+            "b. Petroleum refining": "1.A.1.b",
+            "c. Manufacture of solid fuels": "1.A.1.c",
+        },
+        "header": ["orig_category"],
+        "years": [
+            "2008",
+            "2009",
+            "2010",
+            "2011",
+            "2012",
+            "2013",
+            "2014",
+            "2015",
+            "2016",
+            "2017",
+            "2018",
+        ],
+        "extra_columns": [],
+        "rows_to_fix": {3: ["a. Public electricity and heat"]},
+        "page_defs": {
+            "116": {
+                "read_params": dict(
+                    flavor="stream",
+                    table_areas=["72,426,543,333"],
+                    columns=["199,229,261,293,324,356,386,416,448,480,511"],
+                ),
+                "skip_rows_start": 2,
+            },
+        },
+    },
     "overview": {
         "fix_single_value": {
             "cat": "MBIO",
@@ -156,11 +192,11 @@ conf_trend = {
             "keep_value_no": 1,
         },
         "page_defs": {
-            "111": {"skip_rows_start": 1},
-            "112": {"skip_rows_start": 1},
-            "113": {"skip_rows_start": 1},
+            "111": {"read_params": dict(flavor="lattice"), "skip_rows_start": 1},
+            "112": {"read_params": dict(flavor="lattice"), "skip_rows_start": 1},
+            "113": {"read_params": dict(flavor="lattice"), "skip_rows_start": 1},
         },
-    }
+    },
 }
 
 conf = {

+ 17 - 47
src/unfccc_ghg_data/unfccc_reader/Saint_Kitts_and_Nevis/read_KNA_BUR1_from_pdf.py

@@ -5,7 +5,7 @@ import camelot
 import pandas as pd
 import primap2 as pm2
 
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
+from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path, fix_rows
 from unfccc_ghg_data.unfccc_reader.Saint_Kitts_and_Nevis.config_kna_bur1 import (
     conf,
     conf_general,
@@ -40,7 +40,7 @@ if __name__ == "__main__":
     # ###
 
     df_trend = None
-    for table in reversed(conf_trend.keys()):
+    for table in conf_trend.keys():
         print("-" * 45)
         print(f"Reading {table} trend table.")
         df_table = None
@@ -49,8 +49,9 @@ if __name__ == "__main__":
             tables_inventory_original = camelot.read_pdf(
                 str(input_folder / pdf_file),
                 pages=page,
-                flavor="lattice",
+                # flavor is now supplied per page through "read_params" in conf_trend
                 split_text=True,
+                **conf_trend[table]["page_defs"][page]["read_params"],
             )
 
             df_page = tables_inventory_original[0].df
@@ -72,6 +73,17 @@ if __name__ == "__main__":
                     join="outer",
                 ).reset_index(drop=True)
 
+        # fix content that spreads across multiple rows
+        if "rows_to_fix" in conf_trend[table]:
+            for n_rows in conf_trend[table]["rows_to_fix"].keys():
+                print(f"Merge content for {n_rows=}")
+                df_table = fix_rows(
+                    df_table,
+                    rows_to_fix=conf_trend[table]["rows_to_fix"][n_rows],
+                    col_to_use=0,
+                    n_rows=n_rows,
+                )
+
         df_table.columns = (
             conf_trend[table]["header"]
             + conf_trend[table]["years"]
@@ -139,50 +151,8 @@ if __name__ == "__main__":
                 join="outer",
             ).reset_index(drop=True)
 
-    #     # fill empty strings with NaN and the forward fill category names
-    #     df_page["category"] = df_page["category"].replace("", np.nan).ffill()
-    #
-    #     # remove /n from category names
-    #     df_page["category"] = df_page["category"].str.replace("\n", "")
-    #     # manual replacement of categories
-    #     df_page["category"] = df_page["category"].replace(
-    #         inv_conf_per_sector[sector]["cat_codes_manual"]
-    #     )
-    #
-    #     # remove all thousand separator commas
-    #     for year in trend_years :
-    #         df_page[year] = df_page[year].str.replace(",", ".")
-    #
-    #     # add unit
-    #     df_page["unit"] = inv_conf_per_sector[sector]["unit"]
-    #
-    #     # add entity if needed
-    #     if "entity" in inv_conf_per_sector[sector].keys() :
-    #         df_page["entity"] = inv_conf_per_sector[sector]["entity"]
-    #
-    #     if "unit_conversion" in inv_conf_per_sector[sector].keys() :
-    #         for year in trend_years :
-    #             index = inv_conf_per_sector[sector]["unit_conversion"]["index"]
-    #             conv_factor = inv_conf_per_sector[sector]["unit_conversion"][
-    #                 "conversion_factor"
-    #             ]
-    #             df_page.loc[index, year] = str(
-    #                 conv_factor * float(df_page.loc[index, year])
-    #             )
-    #
-    #     # stack the tables vertically
-    #     if df_trend is None :
-    #         df_trend = df_page
-    #     else :
-    #         df_trend = pd.concat(
-    #             [
-    #                 df_trend,
-    #                 df_page,
-    #             ],
-    #             axis=0,
-    #             join="outer",
-    #         ).reset_index(drop=True)
-    #
+        # break
+
     df_trend_if = pm2.pm2io.convert_wide_dataframe_if(
         df_trend,
         coords_cols=coords_cols,