restore functions.py

Daniel Busch · 11 months ago · parent commit dfd5b019fc

+ 64 - 68
UNFCCC_GHG_data/UNFCCC_reader/Guinea/read_GIN_BUR1_from_pdf.py

@@ -10,7 +10,8 @@ import primap2 as pm2
 import pandas as pd
 
 from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
-from UNFCCC_GHG_data.helper.functions import find_and_replace_values, process_data_for_country
+from UNFCCC_GHG_data.helper.functions import process_data_for_country
+from UNFCCC_GHG_data.helper.functions_temp import find_and_replace_values
 from config_GIN_BUR1 import coords_cols, coords_defaults, coords_terminologies
 from config_GIN_BUR1 import (
     coords_value_mapping,
@@ -18,7 +19,13 @@ from config_GIN_BUR1 import (
     meta_data,
     page_def_templates,
 )
-from config_GIN_BUR1 import inv_conf, country_processing_step1, gas_baskets, replace_info, replace_categories
+from config_GIN_BUR1 import (
+    inv_conf,
+    country_processing_step1,
+    gas_baskets,
+    replace_info,
+    replace_categories,
+)
 
 # ###
 # configuration
@@ -26,7 +33,7 @@ from config_GIN_BUR1 import inv_conf, country_processing_step1, gas_baskets, rep
 
 input_folder = downloaded_data_path / "UNFCCC" / "Guinea" / "BUR1"
 output_folder = extracted_data_path / "UNFCCC" / "Guinea"
-if not output_folder.exists() :
+if not output_folder.exists():
     output_folder.mkdir()
 
 pdf_file = "Rapport_IGES-Guinee-BUR1_VF.pdf"
@@ -40,7 +47,7 @@ compression = dict(zlib=True, complevel=9)
 
 pages = ["110", "111", "112", "113"]
 df_main = None
-for page in pages :
+for page in pages:
     print("-" * 45)
     print(f"Reading table from page {page}.")
 
@@ -58,7 +65,7 @@ for page in pages :
     df_inventory = tables_inventory_original[0].df.copy()
 
     # move broken text in correct row (page 113 is fine)
-    if page in ["110", "111", "112"] :
+    if page in ["110", "111", "112"]:
         df_inventory.at[4, 0] = "1.A.1 - Industries énergétiques"
         df_inventory = df_inventory.drop(index=3)
         df_inventory.at[8, 0] = "1.A.4 - Autres secteurs"
@@ -100,12 +107,10 @@ for page in pages :
 
     df_inventory_long["category"] = df_inventory_long["category"].str.replace(".", "")
 
-
     # regex replacements
-    def repl(m) :
+    def repl(m):
         return m.group("code")
 
-
     df_inventory_long["category"] = df_inventory_long["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -119,9 +124,9 @@ for page in pages :
     df_inventory_long.columns = df_inventory_long.columns.map(str)
     df_inventory_long = df_inventory_long.drop(columns=["orig_cat_name"])
 
-    if df_main is None :
+    if df_main is None:
         df_main = df_inventory_long
-    else :
+    else:
         df_main = pd.concat(
             [df_main, df_inventory_long],
             axis=0,
@@ -141,10 +146,9 @@ df_all_IF = pm2.pm2io.convert_long_dataframe_if(
     time_format="%Y",
 )
 
-df_all_IF = find_and_replace_values(df=df_all_IF,
-                                    replace_info=replace_info['main'],
-                                    category_column=category_column
-                                    )
+df_all_IF = find_and_replace_values(
+    df=df_all_IF, replace_info=replace_info["main"], category_column=category_column
+)
 
 ### convert to primap2 format ###
 data_pm2_main = pm2.pm2io.from_interchange_format(df_all_IF)
@@ -155,7 +159,7 @@ data_pm2_main = pm2.pm2io.from_interchange_format(df_all_IF)
 
 pages = ["116", "117", "118", "119"]
 df_energy = None
-for page in pages :
+for page in pages:
     print("-" * 45)
     print(f"Reading table from page {page}.")
 
@@ -167,7 +171,7 @@ for page in pages :
 
     # cut last two lines of second table to ignore additional information regarding biomass for energy production
     df_energy_year = pd.concat(
-        [tables_inventory_original[0].df[2 :], tables_inventory_original[1].df[3 :-2]],
+        [tables_inventory_original[0].df[2:], tables_inventory_original[1].df[3:-2]],
         axis=0,
         join="outer",
     ).reset_index(drop=True)
@@ -175,19 +179,19 @@ for page in pages :
     row_to_delete = df_energy_year.index[
         df_energy_year[0]
         == "1.A.3.a.i - Aviation internationale (Soutes internationales)"
-        ][0]
+    ][0]
     df_energy_year = df_energy_year.drop(index=row_to_delete)
 
     row_to_delete = df_energy_year.index[
         df_energy_year[0]
         == "1.A.3.d.i - Navigation internationale (soutes internationales)"
-        ][0]
+    ][0]
     df_energy_year = df_energy_year.drop(index=row_to_delete)
 
     row_to_delete = df_energy_year.index[
         df_energy_year[0]
         == "1.A.5.c - Opérations multilatérales (Éléments pour information)"
-        ][0]
+    ][0]
     df_energy_year = df_energy_year.drop(index=row_to_delete)
 
     # add header and unit
@@ -235,12 +239,10 @@ for page in pages :
         ".", ""
     )
 
-
     # then the regex replacements
-    def repl(m) :
+    def repl(m):
         return m.group("code")
 
-
     df_energy_year_long["category"] = df_energy_year_long["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -254,9 +256,9 @@ for page in pages :
     df_energy_year_long.columns = df_energy_year_long.columns.map(str)
     df_energy_year_long = df_energy_year_long.drop(columns=["orig_cat_name"])
 
-    if df_energy is None :
+    if df_energy is None:
         df_energy = df_energy_year_long
-    else :
+    else:
         df_energy = pd.concat(
             [df_energy, df_energy_year_long],
             axis=0,
@@ -285,7 +287,7 @@ data_pm2_energy = pm2.pm2io.from_interchange_format(df_energy_IF)
 
 pages = ["124", "125", "126", "127"]
 df_afolu = None
-for page in pages :
+for page in pages:
     print("-" * 45)
     print(f"Reading table from page {page}.")
 
@@ -294,19 +296,19 @@ for page in pages :
     )
     print("Reading complete.")
 
-    if page == "127" :
+    if page == "127":
         # table on page 127 has one extra row at the top
         # and one extra category 3.A.1.j
-        df_afolu_year = tables_inventory_original[0].df[3 :]
+        df_afolu_year = tables_inventory_original[0].df[3:]
         # 3.A.1.a.i to 3.A.1.j exist twice.
         # Rename duplicate categories in tables.
-        for index, category_name in replace_categories['afolu']['127'] :
+        for index, category_name in replace_categories["afolu"]["127"]:
             df_afolu_year.at[index, 0] = category_name
-    else :
+    else:
         # cut first two lines
-        df_afolu_year = tables_inventory_original[0].df[2 :]
+        df_afolu_year = tables_inventory_original[0].df[2:]
         # On pages 124-126 the wrong categories are slightly different
-        for index, category_name in replace_categories['afolu']['124-126'] :
+        for index, category_name in replace_categories["afolu"]["124-126"]:
             df_afolu_year.at[index, 0] = category_name
 
     # add header and unit
@@ -343,12 +345,10 @@ for page in pages :
     # make a copy of the categories row
     df_afolu_year_long["category"] = df_afolu_year_long["orig_cat_name"]
 
-
     # regex replacements
-    def repl(m) :
+    def repl(m):
         return m.group("code")
 
-
     df_afolu_year_long["category"] = df_afolu_year_long["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -362,9 +362,9 @@ for page in pages :
     df_afolu_year_long.columns = df_afolu_year_long.columns.map(str)
     df_afolu_year_long = df_afolu_year_long.drop(columns=["orig_cat_name"])
 
-    if df_afolu is None :
+    if df_afolu is None:
         df_afolu = df_afolu_year_long
-    else :
+    else:
         df_afolu = pd.concat(
             [df_afolu, df_afolu_year_long],
             axis=0,
@@ -408,18 +408,18 @@ tables_inventory_original_130 = camelot.read_pdf(
 
 # save to dict
 df_waste_years = {
-    "1990" : tables_inventory_original_128[0].df,
-    "2000" : tables_inventory_original_128[1].df,
-    "2010" : tables_inventory_original_128[2].df,
-    "2019" : tables_inventory_original_130[0].df,
+    "1990": tables_inventory_original_128[0].df,
+    "2000": tables_inventory_original_128[1].df,
+    "2010": tables_inventory_original_128[2].df,
+    "2019": tables_inventory_original_130[0].df,
 }
 
 df_waste = None
-for year in df_waste_years.keys() :
+for year in df_waste_years.keys():
     print("-" * 45)
     print(f"Processing table for {year}.")
 
-    df_waste_year = df_waste_years[year][2 :]
+    df_waste_year = df_waste_years[year][2:]
 
     # add header and unit
     df_header = pd.DataFrame([inv_conf["header_waste"], inv_conf["unit_waste"]])
@@ -453,12 +453,10 @@ for year in df_waste_years.keys() :
     # make a copy of the categories row
     df_waste_year_long["category"] = df_waste_year_long["orig_cat_name"]
 
-
     # regex replacements
-    def repl(m) :
+    def repl(m):
         return m.group("code")
 
-
     df_waste_year_long["category"] = df_waste_year_long["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -473,9 +471,9 @@ for year in df_waste_years.keys() :
     df_waste_year_long.columns = df_waste_year_long.columns.map(str)
     df_waste_year_long = df_waste_year_long.drop(columns=["orig_cat_name"])
 
-    if df_waste is None :
+    if df_waste is None:
         df_waste = df_waste_year_long
-    else :
+    else:
         df_waste = pd.concat(
             [df_waste, df_waste_year_long],
             axis=0,
@@ -507,7 +505,7 @@ pages = ["131", "132", "133", "134", "135", "136", "137"]
 entities = ["CO2", "CH4", "N2O", "NOx", "CO", "NMVOCs", "SO2"]
 
 # for this set of tables every page is a different entity
-for page, entity in zip(pages, entities) :
+for page, entity in zip(pages, entities):
     # The table for CO seems completely mixed up and should not be considered.
     # The total CO values for 1990 equal the values in the main table.
     # The total CO values for 1995 equal the values for 2000 in the main table.
@@ -516,7 +514,7 @@ for page, entity in zip(pages, entities) :
     # The total CO values for 2010 are identical to the 1990 values in the same table.
     # The total CO values for 2019 are identical to the 1995 values in the same table.
     # And so on.
-    if entity == "CO" :
+    if entity == "CO":
         continue
 
     print("-" * 45)
@@ -527,7 +525,7 @@ for page, entity in zip(pages, entities) :
     # see https://github.com/atlanhq/camelot/issues/306,
     # or because characters in first row almost touch
     # the table grid.
-    if page == "131" :
+    if page == "131":
         tables_inventory_original = camelot.read_pdf(
             str(input_folder / pdf_file),
             pages=page,
@@ -537,7 +535,7 @@ for page, entity in zip(pages, entities) :
             split_text=True,
         )
 
-        df_trend_entity = tables_inventory_original[0].df[1 :]
+        df_trend_entity = tables_inventory_original[0].df[1:]
 
         # The categories 3.D / 3.D.1 / 3.D.2 contain values different to the main table
         # They should also not contain negative values according to IPCC methodology:
@@ -548,19 +546,19 @@ for page, entity in zip(pages, entities) :
 
         row_to_delete = df_trend_entity.index[
             df_trend_entity[0] == "3.D.1 - Produits ligneux récoltés"
-            ][0]
+        ][0]
         df_trend_entity = df_trend_entity.drop(index=row_to_delete)
 
         row_to_delete = df_trend_entity.index[
             df_trend_entity[0] == "3.D.2 - Autres (veuillez spécifier)"
-            ][0]
+        ][0]
         df_trend_entity = df_trend_entity.drop(index=row_to_delete)
 
-    else :
+    else:
         tables_inventory_original = camelot.read_pdf(
             str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
         )
-        df_trend_entity = tables_inventory_original[0].df[3 :]
+        df_trend_entity = tables_inventory_original[0].df[3:]
 
     print("Reading complete.")
 
@@ -589,7 +587,7 @@ for page, entity in zip(pages, entities) :
     df_trend_entity.loc[:, "category"] = df_trend_entity["orig_cat_name"]
 
     # Delete empty line for pages 132-137.
-    if page != "131" :
+    if page != "131":
         row_to_delete = df_trend_entity.index[df_trend_entity["category"] == ""][0]
         df_trend_entity = df_trend_entity.drop(index=row_to_delete)
 
@@ -604,11 +602,9 @@ for page, entity in zip(pages, entities) :
         "\n", ""
     )
 
-
-    def repl(m) :
+    def repl(m):
         return m.group("code")
 
-
     df_trend_entity.loc[:, "category"] = df_trend_entity["category"].str.replace(
         inv_conf["cat_code_regexp"], repl, regex=True
     )
@@ -617,7 +613,7 @@ for page, entity in zip(pages, entities) :
 
     print("Created category codes.")
 
-    for year in columns_years :
+    for year in columns_years:
         df_trend_entity.loc[:, year] = df_trend_entity[year].str.replace(",", ".")
         df_trend_entity.loc[:, year] = df_trend_entity[year].str.replace("NE1", "NE")
 
@@ -635,9 +631,9 @@ for page, entity in zip(pages, entities) :
 
     df_trend_entity_long = df_trend_entity_long.reset_index()
 
-    if df_trend is None :
+    if df_trend is None:
         df_trend = df_trend_entity_long
-    else :
+    else:
         df_trend = pd.concat(
             [df_trend, df_trend_entity_long],
             axis=0,
@@ -658,9 +654,9 @@ df_trend_IF = pm2.pm2io.convert_long_dataframe_if(
     time_format="%Y",
 )
 
-df_trend_IF = find_and_replace_values(df=df_trend_IF,
-                                      replace_info=replace_info["trend"],
-                                      category_column=category_column)
+df_trend_IF = find_and_replace_values(
+    df=df_trend_IF, replace_info=replace_info["trend"], category_column=category_column
+)
 
 ### convert to primap2 format ###
 data_pm2_trend = pm2.pm2io.from_interchange_format(df_trend_IF)
@@ -701,7 +697,7 @@ pm2.pm2io.write_interchange_format(
     data_if,
 )
 
-encoding = {var : compression for var in data_pm2.data_vars}
+encoding = {var: compression for var in data_pm2.data_vars}
 data_pm2.pr.to_netcdf(
     output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
     encoding=encoding,
@@ -731,13 +727,13 @@ terminology_proc = coords_terminologies["category"]
 
 data_proc_if = data_proc_pm2.pr.to_interchange_format()
 
-if not output_folder.exists() :
+if not output_folder.exists():
     output_folder.mkdir()
 pm2.pm2io.write_interchange_format(
     output_folder / (output_filename + terminology_proc), data_proc_if
 )
 
-encoding = {var : compression for var in data_proc_pm2.data_vars}
+encoding = {var: compression for var in data_proc_pm2.data_vars}
 data_proc_pm2.pr.to_netcdf(
     output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
 )
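
For reference, a minimal usage sketch of the relocated helper, matching the call sites above. The import path follows the new import in this diff; the toy DataFrame and the category column name are invented for illustration, and the replace_info tuple format follows the docstring of find_and_replace_values shown in the functions.py diff below.

import pandas as pd

from UNFCCC_GHG_data.helper.functions_temp import find_and_replace_values

# Toy interchange-format frame (illustrative only; the real column names
# come from the config and conversion steps above).
df = pd.DataFrame(
    {
        "category (IPCC2006_PRIMAP)": ["3.C", "3.D"],
        "entity": ["CO", "CO"],
        "2019": [1.0, 2.0],
    }
)

# Set the 2019 CO value for category 3.C; a 3-tuple such as
# ("3.C", "CO", "2019") would replace the value with NaN instead.
df = find_and_replace_values(
    df=df,
    replace_info=[("3.C", "CO", "2019", 3.423)],
    category_column="category (IPCC2006_PRIMAP)",
)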

+ 7 - 57
UNFCCC_GHG_data/helper/functions.py

@@ -79,7 +79,7 @@ def process_data_for_country(
     # remove unused cats
     data_country = data_country.dropna(f"category ({cat_terminology_in})", how="all")
     # remove unused years
-    data_country = data_country.dropna("time", how="all")
+    data_country = data_country.dropna(f"time", how="all")
     # remove variables only containing nan
     nan_vars_country = [
         var
@@ -431,7 +431,7 @@ def convert_categories(
             nan_vars = [
                 var
                 for var in data_agg.data_vars
-                if data_agg[var].isnull().all().data is True
+                if data_agg[var].isnull().all().data == True
             ]
             data_agg = data_agg.drop(nan_vars)
             if len(data_agg.data_vars) > 0:
@@ -625,7 +625,7 @@ def get_country_submissions(
 
     country_submissions = {}
     if print_sub:
-        print("#" * 80)
+        print(f"#" * 80)
         print(f"The following submissions are available for {country_name}")
     for item in data_folder.iterdir():
         if item.is_dir():
@@ -697,7 +697,7 @@ def get_country_datasets(
     rep_data = {}
     # data
     if print_ds:
-        print("#" * 80)
+        print(f"#" * 80)
         print(f"The following datasets are available for {country_name}")
     for item in data_folder.iterdir():
         if item.is_dir():
@@ -757,7 +757,7 @@ def get_country_datasets(
                         if code_file:
                             data_info = data_info + f"code: {code_file.name}"
                         else:
-                            data_info = data_info + "code: not found"
+                            data_info = data_info + f"code: not found"
 
                         cleaned_datasets_current_folder[key] = data_info
 
@@ -775,7 +775,7 @@ def get_country_datasets(
 
     # legacy data
     if print_ds:
-        print("#" * 80)
+        print(f"#" * 80)
         print(f"The following legacy datasets are available for {country_name}")
     legacy_data = {}
     for item in data_folder_legacy.iterdir():
@@ -972,58 +972,8 @@ def fix_rows(
         new_row = new_row.str.replace("- ", "-")
         # replace spaces in numbers
         pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
-        def repl(m):
-            return f"{m.group('first')}{m.group('last')}"
+        repl = lambda m: f"{m.group('first')}{m.group('last')}"
         new_row = new_row.str.replace(pat, repl, regex=True)
         data.loc[indices_to_merge[0]] = new_row
         data = data.drop(indices_to_merge[1:])
     return data
-
-
-def find_and_replace_values(df: pd.DataFrame,
-                            replace_info : list[tuple[str | float]],
-                            category_column : str,
-                            entity_column : str ='entity',
-                            ) -> pd.DataFrame:
-    """
-    Find and replace single values in a dataframe.
-    
-    Input
-    -----
-    df
-        Input data frame
-    replace_info
-        Category, entity, year, and new value. Omit the new value to replace the existing one with NaN.
-        For example [("3.C", "CO", "2019", 3.423)] or [("3.C", "CO", "2019")]
-    category_column
-        The name of the column that contains the categories.
-    entity_column
-        The name of the column that contains the entities.
-        
-    Output
-    ------
-        Data frame with updated values.
-        
-    """
-    for replace_info_value in replace_info:
-        
-        category = replace_info_value[0]
-        entity = replace_info_value[1]
-        year = replace_info_value[2]
-
-        if len(replace_info_value) == 4:
-            new_value = replace_info_value[3]
-        elif len(replace_info_value) == 3:
-            new_value = np.nan
-        else:
-            raise AssertionError(f'Expected tuple of length 3 or 4. Got {replace_info_value}')
-
-        index = df.loc[
-            (df[category_column] == category) & (df[entity_column] == entity),
-        ].index[0]
-        
-        # pandas recommends using .at[] for changing single values
-        df.at[index, year] = new_value
-        print(f"Set value for {category}, {entity}, {year} to {new_value}.")
-
-    return df

+ 0 - 0
UNFCCC_GHG_data/helper/functions_temp.py
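
The rendered diff for functions_temp.py shows no lines (+ 0 - 0), so the relocated helper is not visible on this page. Presumably the file now carries the function deleted from functions.py above; the following is a sketch reconstructed from those removed lines, with the imports it needs added.

from __future__ import annotations

import numpy as np
import pandas as pd


def find_and_replace_values(
    df: pd.DataFrame,
    replace_info: list[tuple[str | float]],
    category_column: str,
    entity_column: str = "entity",
) -> pd.DataFrame:
    """
    Find and replace single values in a dataframe.

    Input
    -----
    df
        Input data frame
    replace_info
        Category, entity, year, and new value. Omit the new value to
        replace the existing one with NaN.
        For example [("3.C", "CO", "2019", 3.423)] or [("3.C", "CO", "2019")]
    category_column
        The name of the column that contains the categories.
    entity_column
        The name of the column that contains the entities.

    Output
    ------
        Data frame with updated values.
    """
    for replace_info_value in replace_info:
        category = replace_info_value[0]
        entity = replace_info_value[1]
        year = replace_info_value[2]

        if len(replace_info_value) == 4:
            new_value = replace_info_value[3]
        elif len(replace_info_value) == 3:
            new_value = np.nan
        else:
            raise AssertionError(
                f"Expected tuple of length 3 or 4. Got {replace_info_value}"
            )

        # locate the single row for this category/entity combination
        index = df.loc[
            (df[category_column] == category) & (df[entity_column] == entity),
        ].index[0]

        # pandas recommends using .at[] for changing single values
        df.at[index, year] = new_value
        print(f"Set value for {category}, {entity}, {year} to {new_value}.")

    return df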