Daniel Busch 9 months ago
Parent
Commit
2c5616c8fa

+ 23 - 1
src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/config_cpv_bur1.py

@@ -46,6 +46,10 @@ coords_value_mapping = {
     },
 }
 
+filter_remove = {
+    "f_memo": {"category": "MEMO"},
+}
+
 meta_data = {
     "references": "https://unfccc.int/sites/default/files/resource/BUR_EN_Digital.pdf",  # TODO check other sources
     "rights": "",  # unknown
@@ -86,9 +90,27 @@ inv_conf_per_sector = {
     },
 }
 
+inv_conf = {"cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*"}
+
 inv_conf_main = {
     "pages": {
-        "86": {"skip_rows_start": 2},
+        "86": {
+            "skip_rows_start": 2,
+            "entities": ["CO2", "CH4", "N2O"],
+            "column_names": ["category", "CO2", "CH4", "N2O"],
+            "cat_codes_manual": {
+                "Memo items": "MEMO",
+                "International bunkers": "M.BK",
+                "CO² emissions from Biomass": "M.BIO",
+                "CO² emissions from using manure as energy": "M.BIO",
+            },
+            "unit_for_entity": {
+                "CO2": "Gg",
+                "CH4": "Gg",
+                "N2O": "Gg",
+            },
+            # "units" : ["no unit", "Gg", "Gg", "Gg"]
+        },
         "87": {"skip_rows_start": 2},
         "88": {"skip_rows_start": 2},
         "89": {"skip_rows_start": 2},

+ 54 - 1
src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/read_CPV_BUR1_from_pdf.py

@@ -13,6 +13,7 @@ from unfccc_ghg_data.unfccc_reader.Cabo_Verde.config_cpv_bur1 import (
     coords_defaults,
     coords_terminologies,
     coords_value_mapping,
+    inv_conf,
     inv_conf_main,
     inv_conf_per_sector,
     meta_data,
@@ -24,6 +25,10 @@ if __name__ == "__main__":
     # configuration
     # ###
 
+    # helper for cat_code_regexp further below: keep only the captured category code
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
     input_folder = downloaded_data_path / "UNFCCC" / "Cabo_Verde" / "BUR1"
     output_folder = extracted_data_path / "UNFCCC" / "Cabo_Verde"
 
@@ -54,6 +59,40 @@ if __name__ == "__main__":
         if not skip_rows_start == 0:
             df_page = df_page[skip_rows_start:]
 
+        df_page.columns = inv_conf_main["pages"][page]["column_names"]
+
+        # first the manual replacements
+        df_page["category"] = df_page["category"].replace(
+            inv_conf_main["pages"][page]["cat_codes_manual"]
+        )
+
+        # Remove dots between letters in category codes
+        df_page["category"] = df_page["category"].str.replace(".", "")
+        # Some categories have a dash between the letters
+        df_page["category"] = df_page["category"].str.replace("-", " ")
+
+        # then the regex replacements
+        df_page["category"] = df_page["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
+
+        df_page = pd.melt(
+            df_page,
+            id_vars="category",
+            value_vars=inv_conf_main["pages"][page]["entities"],
+        )
+
+        df_page = df_page.rename({"value": "data", "variable": "entity"}, axis=1)
+
+        df_page["data"] = df_page["data"].str.replace(",", ".")
+
+        # df_page["unit"] = df_page["entity"]
+
+        # set unit based on entity
+        df_page["unit"] = df_page["entity"].replace(
+            inv_conf_main["pages"][page]["unit_for_entity"]
+        )
+
         # stack the tables vertically
         if df_main is None:
             df_main = df_page
@@ -66,6 +105,20 @@ if __name__ == "__main__":
                 axis=0,
                 join="outer",
             ).reset_index(drop=True)
+        # TODO: remove once the remaining pages are configured
+        # (only page 86 has column_names and entities so far)
+        break
+
+    df_main_if = pm2.pm2io.convert_wide_dataframe_if(
+        df_main,
+        coords_cols=coords_cols,
+        # add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+    )
 
     # ###
     # 1. Read trend tables 1995, 2000, 2005, 2010, 2015 and 2019
@@ -105,7 +158,7 @@ if __name__ == "__main__":
 
-        # remove all thousand separator commas
+        # replace decimal commas with dots
         for year in trend_years:
-            df_page[year] = df_page[year].str.replace(",", "")
+            df_page[year] = df_page[year].str.replace(",", ".")
 
         # add unit
         df_page["unit"] = inv_conf_per_sector[sector]["unit"]