Browse Source

first sector table

Daniel Busch 8 months ago
parent
commit
06f52b18bf

+ 15 - 5
src/unfccc_ghg_data/unfccc_reader/Saint_Kitts_and_Nevis/config_kna_bur1.py

@@ -2,13 +2,23 @@
 Configuration file to read Saint Kitts and Nevis' BUR 1.
 """
 
+conf_general = {  # settings that are not keyed by sector (unlike `conf`)
+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
+}
+
 conf = {
     "energy": {
+        "header": ["orig_category", "CO2", "CH4", "N2O", "NOX", "CO", "NMVOCs", "SO2"],
+        "unit": 8 * ["Gg"],  # eight "Gg" entries, not one repeated string
+        "cat_codes_manual": {
+            "Information Items": "MEMO",
+            "CO2 from Biomass Combustion for Energy Production": "MBIO",
+        },
         "page_defs": {
-            "149": {"skip_rows_start": 0},
-            "150": {"skip_rows_start": 0},
-            "151": {"skip_rows_start": 0},
-            "152": {"skip_rows_start": 0},
-        }
+            "149": {"skip_rows_start": 2},
+            "150": {"skip_rows_start": 2},
+            "151": {"skip_rows_start": 2},
+            "152": {"skip_rows_start": 2},
+        },
     }
 }

+ 29 - 1
src/unfccc_ghg_data/unfccc_reader/Saint_Kitts_and_Nevis/read_KNA_BUR1_from_pdf.py

@@ -5,7 +5,10 @@ import camelot
 import pandas as pd
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
-from unfccc_ghg_data.unfccc_reader.Saint_Kitts_and_Nevis.config_kna_bur1 import conf
+from unfccc_ghg_data.unfccc_reader.Saint_Kitts_and_Nevis.config_kna_bur1 import (
+    conf,
+    conf_general,
+)
 
 if __name__ == "__main__":
     # ###
@@ -35,6 +38,7 @@ if __name__ == "__main__":
 
         df_sector = None
         for page in conf[sector]["page_defs"].keys():
+            print(f"Page {page}")
             tables_inventory_original = camelot.read_pdf(
                 str(input_folder / pdf_file),
                 pages=page,
@@ -44,6 +48,10 @@ if __name__ == "__main__":
 
             df_page = tables_inventory_original[0].df
 
+            skip_rows_start = conf[sector]["page_defs"][page]["skip_rows_start"]
+            if skip_rows_start != 0:
+                df_page = df_page[skip_rows_start:]  # drop the leading rows
+
             if df_sector is None:
                 df_sector = df_page
             else:
@@ -56,4 +64,24 @@ if __name__ == "__main__":
                     join="outer",
                 ).reset_index(drop=True)
 
+        df_sector.columns = conf[sector]["header"]
+
+        df_sector["category"] = df_sector["orig_category"]
+
+        # Remove line break characters
+        df_sector["category"] = df_sector["category"].str.replace("\n", " ")
+
+        # first the manual replacements
+        df_sector["category"] = df_sector["category"].replace(
+            conf[sector]["cat_codes_manual"]
+        )
+
+        # then the regex replacements: keep only the captured category code
+        df_sector["category"] = df_sector["category"].str.replace(
+            conf_general["cat_code_regexp"], lambda m: m.group("code"), regex=True
+        )
+
+        df_sector = df_sector.drop(columns="orig_category")
+        pass
+
         pass