Bläddra i källkod

all tables merged in pm2 format, saved raw

Daniel Busch 8 månader sedan
förälder
incheckning
1a6a6a0c69

+ 68 - 4
src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/config_cpv_bur1.py

@@ -35,6 +35,15 @@ coords_defaults = {
 
 gwp_to_use = "SARGWP100"
 
+coords_value_mapping_main = {
+    "unit": "PRIMAP1",
+    "category": "PRIMAP1",
+    "entity": {
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "HFC": f"HFCS ({gwp_to_use})",
+    },
+}
+
 coords_value_mapping = {
     "unit": "PRIMAP1",
     "category": "PRIMAP1",
@@ -48,6 +57,8 @@ coords_value_mapping = {
 
 filter_remove = {
     "f_memo": {"category": "MEMO"},
+    # They are all NaN and don't match a pre-defined entity
+    "f_fluor": {"entity": "Other fluorinated products"},
 }
 
 meta_data = {
@@ -90,7 +101,13 @@ inv_conf_per_sector = {
     },
 }
 
-inv_conf = {"cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*"}
+inv_conf = {
+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
+    "year": "2019",
+    # TODO: check again that "CO2 emissions from Biomass" and
+    # "CO2 emissions from using manure as energy" really are the same category.
+    # Rows of the category named in "merge_cats" are summed up by the reader script.
+    "merge_cats": "MBIO",
+}
 
 inv_conf_main = {
     "pages": {
@@ -111,8 +128,55 @@ inv_conf_main = {
             },
             # "units" : ["no unit", "Gg", "Gg", "Gg"]
         },
-        "87": {"skip_rows_start": 2},
-        "88": {"skip_rows_start": 2},
-        "89": {"skip_rows_start": 2},
+        "87": {
+            "skip_rows_start": 2,
+            "entities": ["CO2", "CH4", "N2O", "HFCs", "Other fluorinated products"],
+            "column_names": [
+                "category",
+                "CO2",
+                "CH4",
+                "N2O",
+                "HFCs",
+                "Other fluorinated products",
+            ],
+            "cat_codes_manual": {
+                "2F. Use of products as substitutes for \nsubstances that degrade the ozone layer": "2.F",
+                "2B4. Production of caprolactam, \nglyoxal and glyoxylic acid": "2.B.4",
+            },
+            "unit_for_entity": {
+                "CO2": "Gg",
+                "CH4": "Gg",
+                "N2O": "Gg",
+                "HFCs": "Gg CO2eq",
+            },
+        },
+        "88": {
+            "skip_rows_start": 2,
+            "entities": ["CO2", "CH4", "N2O"],
+            "column_names": ["category", "CO2", "CH4", "N2O"],
+            "cat_codes_manual": {
+                "3C6. Indirect emissions of N²O from manure \nmanagement": "3.C.6",
+                "3C. Aggregate Sources and Sources of Non-CO²\nEmissions in the soil": "3.C",
+            },
+            "unit_for_entity": {
+                "CO2": "Gg",
+                "CH4": "Gg",
+                "N2O": "Gg",
+            },
+        },
+        "89": {
+            "skip_rows_start": 2,
+            "entities": ["CO2", "CH4", "N2O"],
+            "column_names": ["category", "CO2", "CH4", "N2O"],
+            "cat_codes_manual": {
+                "3C6. Indirect emissions of N²O from manure \nmanagement": "3.C.6",
+                "3C. Aggregate Sources and Sources of Non-CO²\nEmissions in the soil": "3.C",
+            },
+            "unit_for_entity": {
+                "CO2": "Gg",
+                "CH4": "Gg",
+                "N2O": "Gg",
+            },
+        },
     },
 }

+ 68 - 11
src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/read_CPV_BUR1_from_pdf.py

@@ -13,6 +13,8 @@ from unfccc_ghg_data.unfccc_reader.Cabo_Verde.config_cpv_bur1 import (
     coords_defaults,
     coords_terminologies,
     coords_value_mapping,
+    coords_value_mapping_main,
+    filter_remove,
     inv_conf,
     inv_conf_main,
     inv_conf_per_sector,
@@ -41,11 +43,12 @@ if __name__ == "__main__":
     compression = dict(zlib=True, complevel=9)
 
     # ###
-    # 2. Read sector-specific main tables for 2019
+    # 1. Read sector-specific main tables for 2019
     # ###
 
     df_main = None
-    for page in inv_conf_main["pages"].keys():
+    for page in reversed(inv_conf_main["pages"].keys()):
+        print(f"Read table on page {page}")
         tables_inventory_original = camelot.read_pdf(
             str(input_folder / pdf_file),
             pages=page,
@@ -105,26 +108,54 @@ if __name__ == "__main__":
                 axis=0,
                 join="outer",
             ).reset_index(drop=True)
-        break
 
-    df_main_if = pm2.pm2io.convert_wide_dataframe_if(
+    df_main["time"] = inv_conf["year"]
+
+    # remove stray "HFC" strings from the data column
+    df_main["data"] = df_main["data"].str.replace("HFC", "")
+
+    # Sum up the values for duplicate categories
+    cat = inv_conf["merge_cats"]
+    df_temp = df_main.loc[df_main["category"] == cat]
+    df_temp["data"] = df_temp["data"].replace("", np.nan).apply(float)
+    df_temp = df_temp.groupby(
+        ["category", "entity", "unit", "time"], as_index=False
+    ).sum()
+    # change zeros back to empty strings
+    df_temp = df_temp.replace(0, "")
+    # drop category from df
+    df_main = df_main.drop(df_main[df_main["category"] == cat].index)
+    # append the summed up sub-set
+    df_main = pd.concat(
+        [df_main, df_temp],
+        axis=0,
+        join="outer",
+    ).reset_index(drop=True)
+
+    df_main_if = pm2.pm2io.convert_long_dataframe_if(
         df_main,
         coords_cols=coords_cols,
         # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
-        coords_value_mapping=coords_value_mapping,
+        coords_value_mapping=coords_value_mapping_main,
         # coords_value_filling=coords_value_filling,
-        # filter_remove=filter_remove,
+        filter_remove=filter_remove,
         # filter_keep=filter_keep,
         meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
     )
 
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_if)
+
     # ###
-    # 1. Read trend tables 1995, 2000, 2005, 2010, 2015 and 2019
+    # 2. Read trend tables 1995, 2000, 2005, 2010, 2015 and 2019
     # ###
     df_trend = None
-    for sector in inv_conf_per_sector.keys():
+    for sector in reversed(inv_conf_per_sector.keys()):
         tables_inventory_original = camelot.read_pdf(
             str(input_folder / pdf_file),
             pages=inv_conf_per_sector[sector]["page"],
@@ -180,7 +211,7 @@ if __name__ == "__main__":
                 join="outer",
             ).reset_index(drop=True)
 
-    data_if = pm2.pm2io.convert_wide_dataframe_if(
+    df_trend_if = pm2.pm2io.convert_wide_dataframe_if(
         df_trend,
         coords_cols=coords_cols,
         # add_coords_cols=add_coords_cols,
@@ -188,11 +219,37 @@ if __name__ == "__main__":
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
         # coords_value_filling=coords_value_filling,
-        # filter_remove=filter_remove,
+        filter_remove=filter_remove,
         # filter_keep=filter_keep,
         meta_data=meta_data,
     )
 
     ### convert to primap2 format ###
     print("Converting to primap2 format.")
-    data_pm2 = pm2.pm2io.from_interchange_format(data_if)
+    data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_if)
+
+    # ###
+    # Merge the main table for 2019 and the trend tables
+    # ###
+
+    print("Merging main table and trend tables")
+    print("Merging waste table.")
+    data_pm2 = data_main_pm2.pr.merge(data_trend_pm2)  # , tolerance=0.10)
+
+    # ###
+    # Save raw data to IF and native format.
+    # ###
+
+    data_if = data_pm2.pr.to_interchange_format()
+
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
+
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )