function to replace single values

Daniel Busch 11 months ago
commit 57af954fe7

+ 1 - 0
.gitignore

@@ -1,6 +1,7 @@
 .idea
 .DS_Store
 venv
+notebooks
 geckodriver.log
 __pycache__
 /JG_test_code/

+ 67 - 0
UNFCCC_GHG_data/UNFCCC_reader/Guinea/config_GIN_BUR1.py

@@ -277,3 +277,70 @@ gas_baskets = {
     "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
     "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
 }
+
+replace_info = {
+    'main' : [
+        ("3", "CO", "2019", 27.406),
+        ("3.C", "CO", "2019", 27.406),
+        ("3.C.1", "CO", "2019", 27.406),
+        ("3", "N2O", "1990", 2.190),
+        ("3","NOx","2019",1.644),
+        ("3.C","NOx","2019",1.644),
+        ("3.C.1","NOx","2019",1.644),
+        ("M.BK","NOx","1990",0.001),
+        ("M.BK","NOx","2000",0.003),
+        ("M.BK","NOx","2010",0.052),
+        ("M.BK","CO","1990",0.0002),
+        ("M.BK","CO","2000",0.0006),
+        ("M.BK","CO","2010",0.01),
+        ("M.BK","NMVOC","1990",0.0001),
+        ("M.BK","NMVOC","2000",0.0002),
+        ("M.BK","NMVOC","2010",0.003),
+],
+    'trend' : [
+    ("M.BK","CH4","1990"),
+    ("M.BK.A","CH4","1990"),
+    ("M.BK","CH4","2000"),
+    ("M.BK.A","CH4","2000"),
+    ("M.BK","CH4","2010"),
+    ("M.BK.A","CH4","2010"),
+    ("1.A.2","N2O","1990"),
+    ("M.BK","N2O","1990"),
+    ("M.BK.A","N2O","1990"),
+    ("M.BK","N2O","2000"),
+    ("M.BK.A","N2O","2000"),
+    ("M.BK","N2O","2010"),
+    ("M.BK.A","N2O","2010"),
+    ("M.BK","N2O","2019"),
+    ("M.BK.A","N2O","2019"),
+    ("M.BK","NOx","1990"),
+    ("M.BK","NOx","2000"),
+    ("M.BK","NOx","2010"),
+    ("3.C","NOx","2019"),
+    ("3.C.1","NOx","2019"),
+    ("3","NOx","2019"),
+    ("1.A.2","NMVOC", "1990"),
+    ("M.BK","NMVOC", "1990"),
+    ("0","NMVOC", "2000"),
+    ("1","NMVOC", "2000"),
+    ("1.A","NMVOC", "2000"),
+    ("1.A.1","NMVOC", "2000"),
+    ("1.A.2","NMVOC", "2000"),
+    ("1.A.3","NMVOC", "2000"),
+    ("1.A.4","NMVOC", "2000"),
+    ("2","NMVOC", "2000"),
+    ("2.H","NMVOC", "2000"),
+    ("2.H.2","NMVOC", "2000"),
+    ("M.BK","NMVOC", "2000"),
+    ("0","NMVOC", "2010"),
+    ("1","NMVOC", "2010"),
+    ("1.A","NMVOC", "2010"),
+    ("1.A.1","NMVOC", "2010"),
+    ("1.A.2","NMVOC", "2010"),
+    ("1.A.3","NMVOC", "2010"),
+    ("1.A.4","NMVOC", "2010"),
+    ("2","NMVOC", "2010"),
+    ("M.BK","NMVOC", "2010"),
+    ("1.A.2","NMVOC", "2019"),
+],
+}
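
These tuples are consumed by the new find_and_replace_values helper added to UNFCCC_GHG_data/helper/functions.py in this commit: a 4-tuple (category, entity, year, value) overwrites a single value, while a 3-tuple (category, entity, year), as used in the 'trend' list, sets it to NaN. They replace the manual .loc assignments removed from read_GIN_BUR1_from_pdf.py below.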

+ 107 - 279
UNFCCC_GHG_data/UNFCCC_reader/Guinea/read_GIN_BUR1_from_pdf.py

@@ -14,6 +14,7 @@ from datetime import date
 import xarray as xr
 
 from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper.functions import find_and_replace_values
 from config_GIN_BUR1 import coords_cols, coords_defaults, coords_terminologies
 from config_GIN_BUR1 import (
     coords_value_mapping,
@@ -21,7 +22,7 @@ from config_GIN_BUR1 import (
     meta_data,
     page_def_templates,
 )
-from config_GIN_BUR1 import inv_conf, country_processing_step1, gas_baskets
+from config_GIN_BUR1 import inv_conf, country_processing_step1, gas_baskets, replace_info
 
 # ###
 # configuration
@@ -29,7 +30,7 @@ from config_GIN_BUR1 import inv_conf, country_processing_step1, gas_baskets
 
 input_folder = downloaded_data_path / "UNFCCC" / "Guinea" / "BUR1"
 output_folder = extracted_data_path / "UNFCCC" / "Guinea"
-if not output_folder.exists():
+if not output_folder.exists() :
     output_folder.mkdir()
 
 pdf_file = "Rapport_IGES-Guinee-BUR1_VF.pdf"
@@ -43,7 +44,7 @@ compression = dict(zlib=True, complevel=9)
 
 pages = ["110", "111", "112", "113"]
 df_main = None
-for page in pages:
+for page in pages :
     print("-" * 45)
     print(f"Reading table from page {page}.")
 
@@ -61,7 +62,7 @@ for page in pages:
     df_inventory = tables_inventory_original[0].df.copy()
 
     # move broken text in correct row (page 113 is fine)
-    if page in ["110", "111", "112"]:
+    if page in ["110", "111", "112"] :
         df_inventory.at[4, 0] = "1.A.1 - Industries énergétiques"
         df_inventory = df_inventory.drop(index=3)
         df_inventory.at[8, 0] = "1.A.4 - Autres secteurs"
@@ -103,8 +104,12 @@ for page in pages:
 
     df_inventory_long["category"] = df_inventory_long["category"].str.replace(".", "")
 
+
     # regex replacements
-    repl = lambda m: m.group("code")
+    def repl(m) :
+        return m.group("code")
+
+
     df_inventory_long["category"] = df_inventory_long["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -118,9 +123,9 @@ for page in pages:
     df_inventory_long.columns = df_inventory_long.columns.map(str)
     df_inventory_long = df_inventory_long.drop(columns=["orig_cat_name"])
 
-    if df_main is None:
+    if df_main is None :
         df_main = df_inventory_long
-    else:
+    else :
         df_main = pd.concat(
             [df_main, df_inventory_long],
             axis=0,
@@ -140,85 +145,10 @@ df_all_IF = pm2.pm2io.convert_long_dataframe_if(
     time_format="%Y",
 )
 
-# There are inconsistent values in the main and the afolu table
-# It looks like they put the values from 1990 again for 2019 in the main table.
-# The values from the afolu table are assumed to be the correct ones.
-df_all_IF.loc[
-    (df_all_IF[category_column] == "3") & (df_all_IF["entity"] == "CO"),
-    "2019",
-] = 27.406
-df_all_IF.loc[
-    (df_all_IF[category_column] == "3.C") & (df_all_IF["entity"] == "CO"),
-    "2019",
-] = 27.406
-df_all_IF.loc[
-    (df_all_IF[category_column] == "3.C.1") & (df_all_IF["entity"] == "CO"),
-    "2019",
-] = 27.406
-
-# Values for category 3 and N2O are identical for 1990 and 2019
-# The sum of the sub-categories does not equal the value of the parent category
-# The value  in the afolu table should therefore be the correct one
-df_all_IF.loc[
-    (df_all_IF[category_column] == "3") & (df_all_IF["entity"] == "N2O"),
-    "1990",
-] = 2.190
-
-# Values for category 3 and NOx are identical for 1990 and 2019
-# Replacing the duplicate value with the value from the afolu table
-df_all_IF.loc[
-    (df_all_IF[category_column] == "3") & (df_all_IF["entity"] == "NOx"),
-    "2019",
-] = 1.644
-df_all_IF.loc[
-    (df_all_IF[category_column] == "3.C") & (df_all_IF["entity"] == "NOx"),
-    "2019",
-] = 1.644
-df_all_IF.loc[
-    (df_all_IF[category_column] == "3.C.1") & (df_all_IF["entity"] == "NOx"),
-    "2019",
-] = 1.644
-
-# International bunkers
-# NOx
-df_all_IF.loc[
-    (df_all_IF[category_column] == "M.BK") & (df_all_IF["entity"] == "NOx"),
-    "1990",
-] = 0.001
-df_all_IF.loc[
-    (df_all_IF[category_column] == "M.BK") & (df_all_IF["entity"] == "NOx"),
-    "2000",
-] = 0.003
-df_all_IF.loc[
-    (df_all_IF[category_column] == "M.BK") & (df_all_IF["entity"] == "NOx"),
-    "2010",
-] = 0.052
-# CO
-df_all_IF.loc[
-    (df_all_IF[category_column] == "M.BK") & (df_all_IF["entity"] == "CO"),
-    "1990",
-] = 0.0002
-df_all_IF.loc[
-    (df_all_IF[category_column] == "M.BK") & (df_all_IF["entity"] == "CO"),
-    "2000",
-] = 0.0006
-df_all_IF.loc[
-    (df_all_IF[category_column] == "M.BK") & (df_all_IF["entity"] == "CO"),
-    "2010",
-] = 0.01
-# NMVOC
-df_all_IF.loc[
-    (df_all_IF[category_column] == "M.BK") & (df_all_IF["entity"] == "NMVOC"),
-    "1990",
-] = 0.0001
-df_all_IF.loc[
-    (df_all_IF[category_column] == "M.BK") & (df_all_IF["entity"] == "NMVOC"),
-    "2000",
-] = 0.0002
-df_all_IF.loc[
-    (df_all_IF[category_column] == "M.BK") & (df_all_IF["entity"] == "NMVOC"),
-    "2010",
-] = 0.003
+df_all_IF = find_and_replace_values(df=df_all_IF,
+                                    replace_info=replace_info['main'],
+                                    category_column=category_column
+                                    )
 
 ### convert to primap2 format ###
 data_pm2_main = pm2.pm2io.from_interchange_format(df_all_IF)
@@ -229,7 +159,7 @@ data_pm2_main = pm2.pm2io.from_interchange_format(df_all_IF)
 
 pages = ["116", "117", "118", "119"]
 df_energy = None
-for page in pages:
+for page in pages :
     print("-" * 45)
     print(f"Reading table from page {page}.")
 
@@ -241,7 +171,7 @@ for page in pages:
 
     # cut last two lines of second table to ignore additional information regarding biomass for energy production
     df_energy_year = pd.concat(
-        [tables_inventory_original[0].df[2:], tables_inventory_original[1].df[3:-2]],
+        [tables_inventory_original[0].df[2 :], tables_inventory_original[1].df[3 :-2]],
         axis=0,
         join="outer",
     ).reset_index(drop=True)
@@ -249,19 +179,19 @@ for page in pages:
     row_to_delete = df_energy_year.index[
         df_energy_year[0]
         == "1.A.3.a.i - Aviation internationale (Soutes internationales)"
-    ][0]
+        ][0]
     df_energy_year = df_energy_year.drop(index=row_to_delete)
 
     row_to_delete = df_energy_year.index[
         df_energy_year[0]
         == "1.A.3.d.i - Navigation internationale (soutes internationales)"
-    ][0]
+        ][0]
     df_energy_year = df_energy_year.drop(index=row_to_delete)
 
     row_to_delete = df_energy_year.index[
         df_energy_year[0]
         == "1.A.5.c - Opérations multilatérales (Éléments pour information)"
-    ][0]
+        ][0]
     df_energy_year = df_energy_year.drop(index=row_to_delete)
 
     # add header and unit
@@ -309,8 +239,12 @@ for page in pages:
         ".", ""
     )
 
+
     # then the regex replacements
-    repl = lambda m: m.group("code")
+    def repl(m) :
+        return m.group("code")
+
+
     df_energy_year_long["category"] = df_energy_year_long["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -324,9 +258,9 @@ for page in pages:
     df_energy_year_long.columns = df_energy_year_long.columns.map(str)
     df_energy_year_long = df_energy_year_long.drop(columns=["orig_cat_name"])
 
-    if df_energy is None:
+    if df_energy is None :
         df_energy = df_energy_year_long
-    else:
+    else :
         df_energy = pd.concat(
             [df_energy, df_energy_year_long],
             axis=0,
@@ -349,14 +283,13 @@ df_energy_IF = pm2.pm2io.convert_long_dataframe_if(
 ### convert to primap2 format ###
 data_pm2_energy = pm2.pm2io.from_interchange_format(df_energy_IF)
 
-
 # ###
 # 3. Read in afolu table
 # ###
 
 pages = ["124", "125", "126", "127"]
 df_afolu = None
-for page in pages:
+for page in pages :
     print("-" * 45)
     print(f"Reading table from page {page}.")
 
@@ -365,10 +298,10 @@ for page in pages:
     )
     print("Reading complete.")
 
-    if page == "127":
+    if page == "127" :
         # table on page 127 has one extra row at the top
         # and one extra category 3.A.1.j
-        df_afolu_year = tables_inventory_original[0].df[3:]
+        df_afolu_year = tables_inventory_original[0].df[3 :]
         # 3.A.1.a.i to 3.A.1.j exist twice.
         # Rename duplicate categories in tables.
         replace_categories = [
@@ -384,11 +317,11 @@ for page in pages:
             (28, "3.A.2.i - Volailles"),
             (29, "3.A.2.j - Autres (préciser)"),
         ]
-        for index, category_name in replace_categories:
+        for index, category_name in replace_categories :
             df_afolu_year.at[index, 0] = category_name
-    else:
+    else :
         # cut first two lines
-        df_afolu_year = tables_inventory_original[0].df[2:]
+        df_afolu_year = tables_inventory_original[0].df[2 :]
         # On pages 124-126 the wrong categories are slightly different
         replace_categories = [
             (17, "3.A.2.a.i - Vaches laitières"),
@@ -402,7 +335,7 @@ for page in pages:
             (25, "3.A.2.h - Porcins"),
             (26, "3.A.2.i - Volailles"),
         ]
-        for index, category_name in replace_categories:
+        for index, category_name in replace_categories :
             df_afolu_year.at[index, 0] = category_name
 
     # add header and unit
@@ -439,8 +372,12 @@ for page in pages:
     # make a copy of the categories row
     df_afolu_year_long["category"] = df_afolu_year_long["orig_cat_name"]
 
+
     # regex replacements
-    repl = lambda m: m.group("code")
+    def repl(m) :
+        return m.group("code")
+
+
     df_afolu_year_long["category"] = df_afolu_year_long["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -454,9 +391,9 @@ for page in pages:
     df_afolu_year_long.columns = df_afolu_year_long.columns.map(str)
     df_afolu_year_long = df_afolu_year_long.drop(columns=["orig_cat_name"])
 
-    if df_afolu is None:
+    if df_afolu is None :
         df_afolu = df_afolu_year_long
-    else:
+    else :
         df_afolu = pd.concat(
             [df_afolu, df_afolu_year_long],
             axis=0,
@@ -500,18 +437,18 @@ tables_inventory_original_130 = camelot.read_pdf(
 
 # save to dict
 df_waste_years = {
-    "1990": tables_inventory_original_128[0].df,
-    "2000": tables_inventory_original_128[1].df,
-    "2010": tables_inventory_original_128[2].df,
-    "2019": tables_inventory_original_130[0].df,
+    "1990" : tables_inventory_original_128[0].df,
+    "2000" : tables_inventory_original_128[1].df,
+    "2010" : tables_inventory_original_128[2].df,
+    "2019" : tables_inventory_original_130[0].df,
 }
 
 df_waste = None
-for year in df_waste_years.keys():
+for year in df_waste_years.keys() :
     print("-" * 45)
     print(f"Processing table for {year}.")
 
-    df_waste_year = df_waste_years[year][2:]
+    df_waste_year = df_waste_years[year][2 :]
 
     # add header and unit
     df_header = pd.DataFrame([inv_conf["header_waste"], inv_conf["unit_waste"]])
@@ -545,8 +482,12 @@ for year in df_waste_years.keys():
     # make a copy of the categories row
     df_waste_year_long["category"] = df_waste_year_long["orig_cat_name"]
 
+
     # regex replacements
-    repl = lambda m: m.group("code")
+    def repl(m) :
+        return m.group("code")
+
+
     df_waste_year_long["category"] = df_waste_year_long["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -561,9 +502,9 @@ for year in df_waste_years.keys():
     df_waste_year_long.columns = df_waste_year_long.columns.map(str)
     df_waste_year_long = df_waste_year_long.drop(columns=["orig_cat_name"])
 
-    if df_waste is None:
+    if df_waste is None :
         df_waste = df_waste_year_long
-    else:
+    else :
         df_waste = pd.concat(
             [df_waste, df_waste_year_long],
             axis=0,
@@ -595,7 +536,7 @@ pages = ["131", "132", "133", "134", "135", "136", "137"]
 entities = ["CO2", "CH4", "N2O", "NOx", "CO", "NMVOCs", "SO2"]
 
 # for this set of tables every page is a different entity
-for page, entity in zip(pages, entities):
+for page, entity in zip(pages, entities) :
     # The table for CO seems completely mixed up and should not be considered.
     # The total CO values for 1990 equal the values in the main table.
     # The total CO values for 1995 equal the values for 2000 in the main table.
@@ -604,7 +545,7 @@ for page, entity in zip(pages, entities):
     # The total CO values for 2010 are identical to the 1990 values in the same table.
     # The total CO values for 2019 are identical to the 1995 values in the same table.
     # And so on.
-    if entity == "CO":
+    if entity == "CO" :
         continue
 
     print("-" * 45)
@@ -615,7 +556,7 @@ for page, entity in zip(pages, entities):
     # see https://github.com/atlanhq/camelot/issues/306,
     # or because characters in first row almost touch
     # the table grid.
-    if page == "131":
+    if page == "131" :
         tables_inventory_original = camelot.read_pdf(
             str(input_folder / pdf_file),
             pages=page,
@@ -625,7 +566,7 @@ for page, entity in zip(pages, entities):
             split_text=True,
         )
 
-        df_trend_entity = tables_inventory_original[0].df[1:]
+        df_trend_entity = tables_inventory_original[0].df[1 :]
 
         # The categories 3.D / 3.D.1 / 3.D.2 contain values different to the main table
         # They should also not contain negative values according to IPCC methodology:
@@ -636,19 +577,19 @@ for page, entity in zip(pages, entities):
 
         row_to_delete = df_trend_entity.index[
             df_trend_entity[0] == "3.D.1 - Produits ligneux récoltés"
-        ][0]
+            ][0]
         df_trend_entity = df_trend_entity.drop(index=row_to_delete)
 
         row_to_delete = df_trend_entity.index[
             df_trend_entity[0] == "3.D.2 - Autres (veuillez spécifier)"
-        ][0]
+            ][0]
         df_trend_entity = df_trend_entity.drop(index=row_to_delete)
 
-    else:
+    else :
         tables_inventory_original = camelot.read_pdf(
             str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
         )
-        df_trend_entity = tables_inventory_original[0].df[3:]
+        df_trend_entity = tables_inventory_original[0].df[3 :]
 
     print("Reading complete.")
 
@@ -677,7 +618,7 @@ for page, entity in zip(pages, entities):
     df_trend_entity.loc[:, "category"] = df_trend_entity["orig_cat_name"]
 
     # Delete empty line for pages 132-137.
-    if page != "131":
+    if page != "131" :
         row_to_delete = df_trend_entity.index[df_trend_entity["category"] == ""][0]
         df_trend_entity = df_trend_entity.drop(index=row_to_delete)
 
@@ -692,7 +633,11 @@ for page, entity in zip(pages, entities):
         "\n", ""
     )
 
-    repl = lambda m: m.group("code")
+
+    def repl(m) :
+        return m.group("code")
+
+
     df_trend_entity.loc[:, "category"] = df_trend_entity["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -701,7 +646,7 @@ for page, entity in zip(pages, entities):
 
     print("Created category codes.")
 
-    for year in columns_years:
+    for year in columns_years :
         df_trend_entity.loc[:, year] = df_trend_entity[year].str.replace(",", ".")
         df_trend_entity.loc[:, year] = df_trend_entity[year].str.replace("NE1", "NE")
 
@@ -719,9 +664,9 @@ for page, entity in zip(pages, entities):
 
     df_trend_entity_long = df_trend_entity_long.reset_index()
 
-    if df_trend is None:
+    if df_trend is None :
         df_trend = df_trend_entity_long
-    else:
+    else :
         df_trend = pd.concat(
             [df_trend, df_trend_entity_long],
             axis=0,
@@ -742,127 +687,9 @@ df_trend_IF = pm2.pm2io.convert_long_dataframe_if(
     time_format="%Y",
 )
 
-# CH4 - values in main table are assumed to be correct
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "CH4"),
-    "1990",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK.A") & (df_trend_IF["entity"] == "CH4"),
-    "1990",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "CH4"),
-    "2000",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK.A") & (df_trend_IF["entity"] == "CH4"),
-    "2000",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "CH4"),
-    "2010",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK.A") & (df_trend_IF["entity"] == "CH4"),
-    "2010",
-] = np.nan
-
-# N2O - values in main table are assumed to be correct
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "1.A.2") & (df_trend_IF["entity"] == "N2O"),
-    "1990",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "N2O"),
-    "1990",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK.A") & (df_trend_IF["entity"] == "N2O"),
-    "1990",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "N2O"),
-    "2000",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK.A") & (df_trend_IF["entity"] == "N2O"),
-    "2000",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "N2O"),
-    "2010",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK.A") & (df_trend_IF["entity"] == "N2O"),
-    "2010",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "N2O"),
-    "2019",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK.A") & (df_trend_IF["entity"] == "N2O"),
-    "2019",
-] = np.nan
-
-# NOx - values in main table are assumed to be correct
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "NOx"),
-    "1990",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "NOx"),
-    "2000",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "M.BK") & (df_trend_IF["entity"] == "NOx"),
-    "2010",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "3.C") & (df_trend_IF["entity"] == "NOx"),
-    "2019",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "3.C.1") & (df_trend_IF["entity"] == "NOx"),
-    "2019",
-] = np.nan
-df_trend_IF.loc[
-    (df_trend_IF[category_column] == "3") & (df_trend_IF["entity"] == "NOx"),
-    "2019",
-] = np.nan
-
-# NMVOC - values in main table are assumed to be correct
-entity = "NMVOC"
-for category, year in [
-    ("1.A.2", "1990"),
-    ("M.BK", "1990"),
-    ("0", "2000"),
-    ("1", "2000"),
-    ("1.A", "2000"),
-    ("1.A.1", "2000"),
-    ("1.A.2", "2000"),
-    ("1.A.3", "2000"),
-    ("1.A.4", "2000"),
-    ("2", "2000"),
-    ("2.H", "2000"),
-    ("2.H.2", "2000"),
-    ("M.BK", "2000"),
-    ("0", "2010"),
-    ("1", "2010"),
-    ("1.A", "2010"),
-    ("1.A.1", "2010"),
-    ("1.A.2", "2010"),
-    ("1.A.3", "2010"),
-    ("1.A.4", "2010"),
-    ("2", "2010"),
-    ("M.BK", "2010"),
-    ("1.A.2", "2019"),
-]:
-    df_trend_IF.loc[
-        (df_trend_IF[category_column] == category) & (df_trend_IF["entity"] == entity),
-        year,
-    ] = np.nan
+df_trend_IF = find_and_replace_values(df=df_trend_IF,
+                                      replace_info=replace_info["trend"],
+                                      category_column=category_column)
 
 ### convert to primap2 format ###
 data_pm2_trend = pm2.pm2io.from_interchange_format(df_trend_IF)
@@ -875,16 +702,20 @@ data_pm2_trend = pm2.pm2io.from_interchange_format(df_trend_IF)
 # There are discrepancies larger than 0.86 for area category 1.A.2, entity NMVOC,
 # years 1990, 2000, 2010, 2019
 # It is assumed the main table has the correct values.
+print("Merging main and energy table.")
 data_pm2 = data_pm2_main.pr.merge(data_pm2_energy, tolerance=1)
 
 # merge afolu
+print("Merging afolu table.")
 data_pm2 = data_pm2.pr.merge(data_pm2_afolu, tolerance=0.11)
 
 # merge waste
 # increasing tolerance to merge values for 4.C, 1990, N2O - 0.003 in sector table, 0.0034 in main table
+print("Merging waste table.")
 data_pm2 = data_pm2.pr.merge(data_pm2_waste, tolerance=0.15)
 
 # merge trend
+print("Merging trend table.")
 data_pm2 = data_pm2.pr.merge(data_pm2_trend, tolerance=0.11)
 
 # convert back to IF to have units in the fixed format ( per year / per a / per annum)
@@ -899,13 +730,12 @@ pm2.pm2io.write_interchange_format(
     data_if,
 )
 
-encoding = {var: compression for var in data_pm2.data_vars}
+encoding = {var : compression for var in data_pm2.data_vars}
 data_pm2.pr.to_netcdf(
     output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
     encoding=encoding,
 )
 
-
 # ###
 # Processing
 # ###
@@ -917,12 +747,12 @@ processing_info_country = country_processing_step1
 data_country = data_pm2
 
 countries = list(data_country.coords[data_country.attrs["area"]].values)
-if len(countries) > 1:
+if len(countries) > 1 :
     raise ValueError(
         f"Found {len(countries)} countries. Only single country data "
         f"can be processed by this function. countries: {countries}"
     )
-else:
+else :
     country_code = countries[0]
 
 # get category terminology
@@ -932,7 +762,7 @@ cat_terminology_in = temp[0]
 
 # get scenario
 scenarios = list(data_country.coords[data_country.attrs["scen"]].values)
-if len(scenarios) > 1:
+if len(scenarios) > 1 :
     raise ValueError(
         f"Found {len(scenarios)} scenarios. Only single scenario data "
         f"can be processed by this function. Scenarios: {scenarios}"
@@ -941,7 +771,7 @@ scenario = scenarios[0]
 
 # get source
 sources = list(data_country.coords["source"].values)
-if len(sources) > 1:
+if len(sources) > 1 :
     raise ValueError(
         f"Found {len(sources)} sources. Only single source data "
         f"can be processed by this function. Sources: {sources}"
@@ -949,9 +779,9 @@ if len(sources) > 1:
 source = sources[0]
 
 # check if category name column present
-if "orig_cat_name" in data_country.coords:
+if "orig_cat_name" in data_country.coords :
     cat_name_present = True
-else:
+else :
     cat_name_present = False
 
 # 1: general processing
@@ -977,38 +807,38 @@ print(
     f"Aggregating categories for country {country_code}, source {source}, "
     f"scenario {scenario}"
 )
-for cat_to_agg in aggregate_cats_current:
+for cat_to_agg in aggregate_cats_current :
     print(f"Category: {cat_to_agg}")
     source_cats = aggregate_cats_current[cat_to_agg]["sources"]
-    data_agg = data_country.pr.loc[{"category": source_cats}].pr.sum(
+    data_agg = data_country.pr.loc[{"category" : source_cats}].pr.sum(
         dim="category", skipna=True, min_count=1
     )
     nan_vars = [
         var for var in data_agg.data_vars if data_agg[var].isnull().all().data is True
     ]
     data_agg = data_agg.drop(nan_vars)
-    if len(data_agg.data_vars) > 0:
+    if len(data_agg.data_vars) > 0 :
         data_agg = data_agg.expand_dims([f"category (" f"{cat_terminology_in})"])
         data_agg = data_agg.assign_coords(
             coords={
-                f"category ({cat_terminology_in})": (
+                f"category ({cat_terminology_in})" : (
                     f"category ({cat_terminology_in})",
                     [cat_to_agg],
                 )
             }
         )
-        if cat_name_present:
+        if cat_name_present :
             cat_name = aggregate_cats_current[cat_to_agg]["name"]
             data_agg = data_agg.assign_coords(
                 coords={
-                    "orig_cat_name": (
+                    "orig_cat_name" : (
                         f"category ({cat_terminology_in})",
                         [cat_name],
                     )
                 }
             )
         data_country = data_country.pr.merge(data_agg, tolerance=agg_tolerance)
-    else:
+    else :
         print(f"no data to aggregate category {cat_to_agg}")
 
 from UNFCCC_GHG_data.helper import GWP_factors
@@ -1017,9 +847,9 @@ from UNFCCC_GHG_data.helper import GWP_factors
 GWPs_to_add = country_processing_step1["basket_copy"]["GWPs_to_add"]
 entities = country_processing_step1["basket_copy"]["entities"]
 source_GWP = country_processing_step1["basket_copy"]["source_GWP"]
-for entity in entities:
+for entity in entities :
     data_source = data_country[f"{entity} ({source_GWP})"]
-    for GWP in GWPs_to_add:
+    for GWP in GWPs_to_add :
         data_GWP = data_source * GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
         data_GWP.attrs["entity"] = entity
         data_GWP.attrs["gwp_context"] = GWP
@@ -1027,27 +857,27 @@ for entity in entities:
 
 # create gas baskets
 entities_present = set(data_country.data_vars)
-for basket in gas_baskets.keys():
+for basket in gas_baskets.keys() :
     basket_contents_present = [
         gas for gas in gas_baskets[basket] if gas in entities_present
     ]
-    if len(basket_contents_present) > 0:
-        if basket in list(data_country.data_vars):
+    if len(basket_contents_present) > 0 :
+        if basket in list(data_country.data_vars) :
             data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
                 basket=basket,
                 basket_contents=basket_contents_present,
                 skipna=True,
                 min_count=1,
             )
-        else:
-            try:
+        else :
+            try :
                 # print(data_country.data_vars)
                 data_country[basket] = xr.full_like(
                     data_country["CO2"], np.nan
                 ).pr.quantify(units="Gg CO2 / year")
                 data_country[basket].attrs = {
-                    "entity": basket.split(" ")[0],
-                    "gwp_context": basket.split(" ")[1][1:-1],
+                    "entity" : basket.split(" ")[0],
+                    "gwp_context" : basket.split(" ")[1][1 :-1],
                 }
                 data_country[basket] = data_country.pr.gas_basket_contents_sum(
                     basket=basket,
@@ -1055,22 +885,20 @@ for basket in gas_baskets.keys():
                     min_count=1,
                 )
                 entities_present.add(basket)
-            except Exception as ex:
+            except Exception as ex :
                 print(
                     f"No gas basket created for {country_code}, {source}, "
                     f"{scenario}: {ex}"
                 )
 
-
 # amend title and comment
 data_country.attrs["comment"] = (
-    data_country.attrs["comment"] + f" Processed on " f"{date.today()}"
+        data_country.attrs["comment"] + f" Processed on " f"{date.today()}"
 )
 data_country.attrs["title"] = (
-    data_country.attrs["title"] + f" Processed on " f"{date.today()}"
+        data_country.attrs["title"] + f" Processed on " f"{date.today()}"
 )
 
-
 # ###
 # save processed data to IF and native format
 # ###
@@ -1080,13 +908,13 @@ terminology_proc = coords_terminologies["category"]
 
 data_proc_if = data_proc_pm2.pr.to_interchange_format()
 
-if not output_folder.exists():
+if not output_folder.exists() :
     output_folder.mkdir()
 pm2.pm2io.write_interchange_format(
     output_folder / (output_filename + terminology_proc), data_proc_if
 )
 
-encoding = {var: compression for var in data_proc_pm2.data_vars}
+encoding = {var : compression for var in data_proc_pm2.data_vars}
 data_proc_pm2.pr.to_netcdf(
     output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
 )

+ 57 - 7
UNFCCC_GHG_data/helper/functions.py

@@ -79,7 +79,7 @@ def process_data_for_country(
     # remove unused cats
     data_country = data_country.dropna(f"category ({cat_terminology_in})", how="all")
     # remove unused years
-    data_country = data_country.dropna(f"time", how="all")
+    data_country = data_country.dropna("time", how="all")
     # remove variables only containing nan
     nan_vars_country = [
         var
@@ -431,7 +431,7 @@ def convert_categories(
             nan_vars = [
                 var
                 for var in data_agg.data_vars
-                if data_agg[var].isnull().all().data == True
+                if data_agg[var].isnull().all().data is True
             ]
             data_agg = data_agg.drop(nan_vars)
             if len(data_agg.data_vars) > 0:
@@ -625,7 +625,7 @@ def get_country_submissions(
 
     country_submissions = {}
     if print_sub:
-        print(f"#" * 80)
+        print("#" * 80)
         print(f"The following submissions are available for {country_name}")
     for item in data_folder.iterdir():
         if item.is_dir():
@@ -697,7 +697,7 @@ def get_country_datasets(
     rep_data = {}
     # data
     if print_ds:
-        print(f"#" * 80)
+        print("#" * 80)
         print(f"The following datasets are available for {country_name}")
     for item in data_folder.iterdir():
         if item.is_dir():
@@ -757,7 +757,7 @@ def get_country_datasets(
                         if code_file:
                             data_info = data_info + f"code: {code_file.name}"
                         else:
-                            data_info = data_info + f"code: not found"
+                            data_info = data_info + "code: not found"
 
                         cleaned_datasets_current_folder[key] = data_info
 
@@ -775,7 +775,7 @@ def get_country_datasets(
 
     # legacy data
     if print_ds:
-        print(f"#" * 80)
+        print("#" * 80)
         print(f"The following legacy datasets are available for {country_name}")
     legacy_data = {}
     for item in data_folder_legacy.iterdir():
@@ -972,8 +972,58 @@ def fix_rows(
         new_row = new_row.str.replace("- ", "-")
         # replace spaces in numbers
         pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
-        repl = lambda m: f"{m.group('first')}{m.group('last')}"
+        def repl(m):
+            return f"{m.group('first')}{m.group('last')}"
         new_row = new_row.str.replace(pat, repl, regex=True)
         data.loc[indices_to_merge[0]] = new_row
         data = data.drop(indices_to_merge[1:])
     return data
+
+
+def find_and_replace_values(df: pd.DataFrame,
+                            replace_info : list[tuple[str | float]],
+                            category_column : str,
+                            entity_column : str ='entity',
+                            ) -> pd.DataFrame:
+    """
+    Find values and replace single values in a dataframe.
+    
+    Input
+    -----
+    df
+        Input data frame
+    replace_info
+        Category, entity, year, and new value. Don't put a new value if you would like to replace with nan.
+        For example [("3.C", "CO", "2019", 3.423)] or [("3.C", "CO", "2019")]
+    category_column
+        The name of the column that contains the categories.
+    entity_column
+        The name of the column that contains the categories.
+        
+    Output
+    ------
+        Data frame with updated values.
+        
+    """
+    for replace_info_value in replace_info:
+        
+        category = replace_info_value[0]
+        entity = replace_info_value[1]
+        year = replace_info_value[2]
+
+        if len(replace_info_value) == 4:
+            new_value = replace_info_value[3]
+        elif len(replace_info_value) == 3:
+            new_value = np.nan
+        else:
+            raise AssertionError(f'Expected tuple of length 3 or 4. Got {replace_info_value}')
+
+        index = df.loc[
+            (df[category_column] == category) & (df[entity_column] == entity),
+        ].index[0]
+        
+        # pandas recommends using .at[] for changing single values
+        df.at[index, year] = new_value
+        print(f"Set value for {category}, {entity}, {year} to {new_value}.")
+
+    return df
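
A minimal usage sketch of the new helper; the toy dataframe, column names, and values below are made up for illustration and are not part of the repository:

    import pandas as pd

    from UNFCCC_GHG_data.helper.functions import find_and_replace_values

    # toy wide-format dataframe with one year column
    df = pd.DataFrame(
        {
            "category": ["3", "M.BK"],
            "entity": ["CO", "NOx"],
            "2019": [7.4, 0.1],
        }
    )

    df = find_and_replace_values(
        df=df,
        replace_info=[
            ("3", "CO", "2019", 27.406),  # 4-tuple: overwrite with the new value
            ("M.BK", "NOx", "2019"),      # 3-tuple: overwrite with NaN
        ],
        category_column="category",
    )
    # df["2019"] is now [27.406, NaN]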