8 månader sedan · 1a6a6a0c69
--- a/src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/config_cpv_bur1.py
+++ b/src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/config_cpv_bur1.py
@@ -35,6 +35,15 @@ coords_defaults = {
 
				 
			
 
				 gwp_to_use = "SARGWP100"
			
 
				 
			
 
				+coords_value_mapping_main = {
			
 
				+    "unit": "PRIMAP1",
			
 
				+    "category": "PRIMAP1",
			
 
				+    "entity": {
			
 
				+        "HFCs": f"HFCS ({gwp_to_use})",
			
 
				+        "HFC": f"HFCS ({gwp_to_use})",
			
 
				+    },
			
 
				+}
			
 
				+
			
 
				 coords_value_mapping = {
			
 
				     "unit": "PRIMAP1",
			
 
				     "category": "PRIMAP1",
			
@@ -48,6 +57,8 @@ coords_value_mapping = {
 
				 
			
 
				 filter_remove = {
			
 
				     "f_memo": {"category": "MEMO"},
			
 
				+    # They are all NaN and don't match a pre-defined entity
			
 
				+    "f_fluor": {"entity": "Other fluorinated products"},
			
 
				 }
			
 
				 
			
 
				 meta_data = {
			
@@ -90,7 +101,13 @@ inv_conf_per_sector = {
 
				     },
			
 
				 }
			
 
				 
			
 
				-inv_conf = {"cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*"}
			
 
				+inv_conf = {
			
 
				+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
			
 
				+    "year": "2019",
			
 
				+    # TODO check again!
			
 
				+    # "CO2 emissions from Biomass" and "CO2 emissions from using manure as energy" are the same category
			
 
				+    "merge_cats": "MBIO",
			
 
				+}
			
 
				 
			
 
				 inv_conf_main = {
			
 
				     "pages": {
			
@@ -111,8 +128,55 @@ inv_conf_main = {
 
				             },
			
 
				             # "units" : ["no unit", "Gg", "Gg", "Gg"]
			
 
				         },
			
 
				-        "87": {"skip_rows_start": 2},
			
 
				-        "88": {"skip_rows_start": 2},
			
 
				-        "89": {"skip_rows_start": 2},
			
 
				+        "87": {
			
 
				+            "skip_rows_start": 2,
			
 
				+            "entities": ["CO2", "CH4", "N2O", "HFCs", "Other fluorinated products"],
			
 
				+            "column_names": [
			
 
				+                "category",
			
 
				+                "CO2",
			
 
				+                "CH4",
			
 
				+                "N2O",
			
 
				+                "HFCs",
			
 
				+                "Other fluorinated products",
			
 
				+            ],
			
 
				+            "cat_codes_manual": {
			
 
				+                "2F. Use of products as substitutes for \nsubstances that degrade the ozone layer": "2.F",
			
 
				+                "2B4. Production of caprolactam, \nglyoxal and glyoxylic acid": "2.B.4",
			
 
				+            },
			
 
				+            "unit_for_entity": {
			
 
				+                "CO2": "Gg",
			
 
				+                "CH4": "Gg",
			
 
				+                "N2O": "Gg",
			
 
				+                "HFCs": "Gg CO2eq",
			
 
				+            },
			
 
				+        },
			
 
				+        "88": {
			
 
				+            "skip_rows_start": 2,
			
 
				+            "entities": ["CO2", "CH4", "N2O"],
			
 
				+            "column_names": ["category", "CO2", "CH4", "N2O"],
			
 
				+            "cat_codes_manual": {
			
 
				+                "3C6. Indirect emissions of N²O from manure \nmanagement": "3.C.6",
			
 
				+                "3C. Aggregate Sources and Sources of Non-CO²\nEmissions in the soil": "3.C",
			
 
				+            },
			
 
				+            "unit_for_entity": {
			
 
				+                "CO2": "Gg",
			
 
				+                "CH4": "Gg",
			
 
				+                "N2O": "Gg",
			
 
				+            },
			
 
				+        },
			
 
				+        "89": {
			
 
				+            "skip_rows_start": 2,
			
 
				+            "entities": ["CO2", "CH4", "N2O"],
			
 
				+            "column_names": ["category", "CO2", "CH4", "N2O"],
			
 
				+            "cat_codes_manual": {
			
 
				+                "3C6. Indirect emissions of N²O from manure \nmanagement": "3.C.6",
			
 
				+                "3C. Aggregate Sources and Sources of Non-CO²\nEmissions in the soil": "3.C",
			
 
				+            },
			
 
				+            "unit_for_entity": {
			
 
				+                "CO2": "Gg",
			
 
				+                "CH4": "Gg",
			
 
				+                "N2O": "Gg",
			
 
				+            },
			
 
				+        },
			
 
				     },
			
 
				 }
			
--- a/src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/read_CPV_BUR1_from_pdf.py
+++ b/src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/read_CPV_BUR1_from_pdf.py
@@ -13,6 +13,8 @@ from unfccc_ghg_data.unfccc_reader.Cabo_Verde.config_cpv_bur1 import (
 
				     coords_defaults,
			
 
				     coords_terminologies,
			
 
				     coords_value_mapping,
			
 
				+    coords_value_mapping_main,
			
 
				+    filter_remove,
			
 
				     inv_conf,
			
 
				     inv_conf_main,
			
 
				     inv_conf_per_sector,
			
@@ -41,11 +43,12 @@ if __name__ == "__main__":
 
				     compression = dict(zlib=True, complevel=9)
			
 
				 
			
 
				     # ###
			
 
				-    # 2. Read sector-specific main tables for 2019
			
 
				+    # 1. Read sector-specific main tables for 2019
			
 
				     # ###
			
 
				 
			
 
				     df_main = None
			
 
				-    for page in inv_conf_main["pages"].keys():
			
 
				+    for page in reversed(inv_conf_main["pages"].keys()):
			
 
				+        print(f"Read table on page {page}")
			
 
				         tables_inventory_original = camelot.read_pdf(
			
 
				             str(input_folder / pdf_file),
			
 
				             pages=page,
			
@@ -105,26 +108,54 @@ if __name__ == "__main__":
 
				                 axis=0,
			
 
				                 join="outer",
			
 
				             ).reset_index(drop=True)
			
 
				-        break
			
 
				 
			
 
				-    df_main_if = pm2.pm2io.convert_wide_dataframe_if(
			
 
				+    df_main["time"] = inv_conf["year"]
			
 
				+
			
 
				+    # remove wrong codes in data column
			
 
				+    df_main["data"] = df_main["data"].str.replace("HFC", "")
			
 
				+
			
 
				+    # Sum up the values for duplicate categories
			
 
				+    cat = inv_conf["merge_cats"]
			
 
				+    df_temp = df_main.loc[df_main["category"] == cat]
			
 
				+    df_temp["data"] = df_temp["data"].replace("", np.nan).apply(float)
			
 
				+    df_temp = df_temp.groupby(
			
 
				+        ["category", "entity", "unit", "time"], as_index=False
			
 
				+    ).sum()
			
 
				+    # change back to empty strings
			
 
				+    df_temp = df_temp.replace(0, "")
			
 
				+    # drop category from df
			
 
				+    df_main = df_main.drop(df_main[df_main["category"] == cat].index)
			
 
				+    # append the summed up sub-set
			
 
				+    df_main = pd.concat(
			
 
				+        [df_main, df_temp],
			
 
				+        axis=0,
			
 
				+        join="outer",
			
 
				+    ).reset_index(drop=True)
			
 
				+
			
 
				+    df_main_if = pm2.pm2io.convert_long_dataframe_if(
			
 
				         df_main,
			
 
				         coords_cols=coords_cols,
			
 
				         # add_coords_cols=add_coords_cols,
			
 
				         coords_defaults=coords_defaults,
			
 
				         coords_terminologies=coords_terminologies,
			
 
				-        coords_value_mapping=coords_value_mapping,
			
 
				+        coords_value_mapping=coords_value_mapping_main,
			
 
				         # coords_value_filling=coords_value_filling,
			
 
				-        # filter_remove=filter_remove,
			
 
				+        filter_remove=filter_remove,
			
 
				         # filter_keep=filter_keep,
			
 
				         meta_data=meta_data,
			
 
				+        convert_str=True,
			
 
				+        time_format="%Y",
			
 
				     )
			
 
				 
			
 
				+    ### convert to primap2 format ###
			
 
				+    print("Converting to primap2 format.")
			
 
				+    data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_if)
			
 
				+
			
 
				     # ###
			
 
				-    # 1. Read trend tables 1995, 2000, 2005, 2010, 2015 and 2019
			
 
				+    # 2. Read trend tables 1995, 2000, 2005, 2010, 2015 and 2019
			
 
				     # ###
			
 
				     df_trend = None
			
 
				-    for sector in inv_conf_per_sector.keys():
			
 
				+    for sector in reversed(inv_conf_per_sector.keys()):
			
 
				         tables_inventory_original = camelot.read_pdf(
			
 
				             str(input_folder / pdf_file),
			
 
				             pages=inv_conf_per_sector[sector]["page"],
			
@@ -180,7 +211,7 @@ if __name__ == "__main__":
 
				                 join="outer",
			
 
				             ).reset_index(drop=True)
			
 
				 
			
 
				-    data_if = pm2.pm2io.convert_wide_dataframe_if(
			
 
				+    df_trend_if = pm2.pm2io.convert_wide_dataframe_if(
			
 
				         df_trend,
			
 
				         coords_cols=coords_cols,
			
 
				         # add_coords_cols=add_coords_cols,
			
@@ -188,11 +219,37 @@ if __name__ == "__main__":
 
				         coords_terminologies=coords_terminologies,
			
 
				         coords_value_mapping=coords_value_mapping,
			
 
				         # coords_value_filling=coords_value_filling,
			
 
				-        # filter_remove=filter_remove,
			
 
				+        filter_remove=filter_remove,
			
 
				         # filter_keep=filter_keep,
			
 
				         meta_data=meta_data,
			
 
				     )
			
 
				 
			
 
				     ### convert to primap2 format ###
			
 
				     print("Converting to primap2 format.")
			
 
				-    data_pm2 = pm2.pm2io.from_interchange_format(data_if)
			
 
				+    data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_if)
			
 
				+
			
 
				+    # ###
			
 
				+    # Merge the main table for 2019 and the trend tables
			
 
				+    # ###
			
 
				+
			
 
				+    print("Merging main table and trend tables")
			
 
				+    print("Merging waste table.")
			
 
				+    data_pm2 = data_main_pm2.pr.merge(data_trend_pm2)  # , tolerance=0.10)
			
 
				+
			
 
				+    # # ###
			
 
				+    # # Save raw data to IF and native format.
			
 
				+    # # ###
			
 
				+
			
 
				+    data_if = data_pm2.pr.to_interchange_format()
			
 
				+
			
 
				+    pm2.pm2io.write_interchange_format(
			
 
				+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
			
 
				+        data_if,
			
 
				+    )
			
 
				+
			
 
				+    encoding = {var: compression for var in data_pm2.data_vars}
			
 
				+    data_pm2.pr.to_netcdf(
			
 
				+        output_folder
			
 
				+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
			
 
				+        encoding=encoding,
			
 
				+    )