@@ -16,8 +16,14 @@ from config_mng_bur2 import ( # noqa: E402
     coords_cols,
     coords_defaults,
     coords_terminologies,
     coords_value_mapping,
+    country_processing_step1,
+    filter_remove,
+    gas_baskets,
+    inv_conf,
     inv_conf_harvested_wood_products,
+    inv_conf_per_entity,
     inv_conf_per_sector,
+    inv_conf_per_year,
     meta_data,
 )
@@ -25,9 +31,12 @@ from unfccc_ghg_data.helper import ( # noqa: E402
     downloaded_data_path,
     extracted_data_path,
     fix_rows,
+    process_data_for_country,
 )
 
-if __name__ == "__main__" :
+# pd.options.mode.chained_assignment = None # default='warn'
+
+if __name__ == "__main__":
     # ###
     # configuration
     # ###
@@ -35,7 +44,7 @@ if __name__ == "__main__" :
     input_folder = downloaded_data_path / "UNFCCC" / "Mongolia" / "BUR2"
     output_folder = extracted_data_path / "UNFCCC" / "Mongolia"
 
-    if not output_folder.exists() :
+    if not output_folder.exists():
         output_folder.mkdir()
 
     pdf_file = "20231112_NIR_MGL.pdf"
@@ -43,294 +52,301 @@ if __name__ == "__main__" :
     category_column = f"category ({coords_terminologies['category']})"
     compression = dict(zlib=True, complevel=9)
 
-
-    def repl(m) :  # noqa: D103
+    def repl(m):  # noqa: D103
         return m.group("code")
 
-
     # ###
     # 1. Read in main tables
     # ###
 
-    # df_main = None
-    # for year in inv_conf_per_year.keys():
-    #     print("-" * 60)
-    #     print(f"Reading year {year}.")
-    #     print("-" * 60)
-    #     df_year = None
-    #     for page in inv_conf_per_year[year]["page_defs"].keys():
-    #         print(f"Reading table from page {page}.")
-    #         tables_inventory_original = camelot.read_pdf(
-    #             str(input_folder / pdf_file),
-    #             pages=page,
-    #             table_areas=inv_conf_per_year[year]["page_defs"][page]["area"],
-    #             columns=inv_conf_per_year[year]["page_defs"][page]["cols"],
-    #             flavor="stream",
-    #             split_text=True,
-    #         )
-    #         print("Reading complete.")
-    #
-    #         df_page = tables_inventory_original[0].df
-    #
-    #         if df_year is None:
-    #             df_year = df_page
-    #         else:
-    #             df_year = pd.concat(
-    #                 [df_year, df_page],
-    #                 axis=0,
-    #                 join="outer",
-    #             ).reset_index(drop=True)
-    #
-    #     print(f"Concatenating all tables for {year}.")
-    #
-    #     # fix content that spreads across multiple rows
-    #     if "rows_to_fix" in inv_conf_per_year[year]:
-    #         for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
-    #             print(f"Merge content for {n_rows=}")
-    #             df_year = fix_rows(
-    #                 df_year,
-    #                 rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
-    #                 col_to_use=0,
-    #                 n_rows=n_rows,
-    #             )
-    #
-    #     df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
-    #
-    #     skip_rows = 11
-    #     df_year = pd.concat(
-    #         [df_header, df_year[skip_rows:]], axis=0, join="outer"
-    #     ).reset_index(drop=True)
-    #
-    #     df_year = pm2.pm2io.nir_add_unit_information(
-    #         df_year,
-    #         unit_row=inv_conf["unit_row"],
-    #         entity_row=inv_conf["entity_row"],
-    #         regexp_entity=".*",
-    #         regexp_unit=".*",
-    #         default_unit="Gg",
-    #     )
-    #
-    #     print("Added unit information.")
-    #
-    #     # set index
-    #     df_year = df_year.set_index(inv_conf["index_cols"])
-    #
-    #     # convert to long format
-    #     df_year_long = pm2.pm2io.nir_convert_df_to_long(
-    #         df_year, year, inv_conf["header_long"]
-    #     )
-    #
-    #     # extract from tuple
-    #     df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
-    #
-    #     # prep for conversion to PM2 IF and native format
-    #     # make a copy of the categories row
-    #     df_year_long["category"] = df_year_long["orig_cat_name"]
-    #
-    #     # replace cat names by codes in col "category"
-    #     # first the manual replacements
-    #
-    #     df_year_long["category"] = df_year_long["category"].replace(
-    #         inv_conf["cat_codes_manual"]
-    #     )
-    #
-    #     df_year_long["category"] = df_year_long["category"].str.replace(".", "")
-    #
-    #     # then the regex replacements
-    #     df_year_long["category"] = df_year_long["category"].str.replace(
-    #         inv_conf["cat_code_regexp"], repl, regex=True
-    #     )
-    #
-    #     df_year_long = df_year_long.reset_index(drop=True)
-    #
-    #     df_year_long["data"] = df_year_long["data"].str.replace(",", "")
-    #
-    #     # make sure all col headers are str
-    #     df_year_long.columns = df_year_long.columns.map(str)
-    #
-    #     df_year_long = df_year_long.drop(columns=["orig_cat_name"])
-    #
-    #     if df_main is None:
-    #         df_main = df_year_long
-    #     else:
-    #         df_main = pd.concat(
-    #             [df_main, df_year_long],
-    #             axis=0,
-    #             join="outer",
-    #         ).reset_index(drop=True)
-    #
-    # ### convert to interchange format ###
-    # print("Converting to interchange format.")
-    # df_main_IF = pm2.pm2io.convert_long_dataframe_if(
-    #     df_main,
-    #     coords_cols=coords_cols,
-    #     coords_defaults=coords_defaults,
-    #     coords_terminologies=coords_terminologies,
-    #     coords_value_mapping=coords_value_mapping,
-    #     filter_remove=filter_remove,
-    #     meta_data=meta_data,
-    #     convert_str=True,
-    #     time_format="%Y",
-    # )
-    #
-    # ### convert to primap2 format ###
-    # print("Converting to primap2 format.")
-    # data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_IF)
-    #
-    # # ###
-    # # 2. Read in trend tables
-    # # ###
-    #
-    # df_trend = None
-    # for entity in inv_conf_per_entity.keys():
-    #     print("-" * 60)
-    #     print(f"Reading entity {entity}.")
-    #
-    #     df_entity = None
-    #
-    #     for page in inv_conf_per_entity[entity]["page_defs"].keys():
-    #         print(f"Reading page {page}.")
-    #
-    #         tables_inventory_original = camelot.read_pdf(
-    #             str(input_folder / pdf_file),
-    #             pages=page,
-    #             table_areas=inv_conf_per_entity[entity]["page_defs"][page]["area"],
-    #             columns=inv_conf_per_entity[entity]["page_defs"][page]["cols"],
-    #             flavor="stream",
-    #             split_text=True,
-    #         )
-    #         df_page = tables_inventory_original[0].df
-    #
-    #         if df_entity is None:
-    #             df_entity = df_page
-    #         else:
-    #             df_entity = pd.concat(
-    #                 [df_entity, df_page],
-    #                 axis=0,
-    #                 join="outer",
-    #             ).reset_index(drop=True)
-    #         print(f"adding table from page {page}.")
-    #
-    #     if "rows_to_fix" in inv_conf_per_entity[entity]:
-    #         for n_rows in inv_conf_per_entity[entity]["rows_to_fix"].keys():
-    #             print(f"Merge content for {n_rows=}")
-    #             df_entity = fix_rows(
-    #                 df_entity,
-    #                 rows_to_fix=inv_conf_per_entity[entity]["rows_to_fix"][n_rows],
-    #                 col_to_use=0,
-    #                 n_rows=n_rows,
-    #             )
-    #
-    #     df_entity.columns = df_entity.iloc[0, :]
-    #     df_entity = df_entity[1:]
-    #
-    #     # unit is always Gg
-    #     df_entity.loc[:, "unit"] = inv_conf_per_entity[entity]["unit"]
-    #
-    #     # only one entity per table
-    #     df_entity.loc[:, "entity"] = entity
-    #
-    #     # TODO: Fix pandas "set value on slice of copy" warning
-    #     df_entity.loc[:, "category"] = df_entity.loc[
-    #         :, inv_conf_per_entity[entity]["category_column"]
-    #     ]
-    #
-    #     if "rows_to_drop" in inv_conf_per_entity[entity]:
-    #         for row in inv_conf_per_entity[entity]["rows_to_drop"]:
-    #             row_to_delete = df_entity.index[df_entity["category"] == row][0]
-    #             df_entity = df_entity.drop(index=row_to_delete)
-    #
-    #     df_entity.loc[:, "category"] = df_entity.loc[:, "category"].replace(
-    #         inv_conf_per_entity[entity]["cat_codes_manual"]
-    #     )
-    #
-    #     df_entity.loc[:, "category"] = df_entity["category"].str.replace(
-    #         inv_conf["cat_code_regexp"], repl, regex=True
-    #     )
-    #
-    #     df_entity = df_entity.drop(
-    #         columns=inv_conf_per_entity[entity]["columns_to_drop"]
-    #     )
-    #
-    #     for year in inv_conf_per_entity[entity]["years"]:
-    #         df_entity.loc[:, year] = df_entity[year].str.replace(",", "")
-    #
-    #     if df_trend is None:
-    #         df_trend = df_entity
-    #     else:
-    #         df_trend = pd.concat(
-    #             [df_trend, df_entity],
-    #             axis=0,
-    #             join="outer",
-    #         ).reset_index(drop=True)
-    #
-    # ### convert to interchange format ###
-    # df_trend_IF = pm2.pm2io.convert_wide_dataframe_if(
-    #     data_wide=df_trend,
-    #     coords_cols=coords_cols,
-    #     coords_defaults=coords_defaults,
-    #     coords_terminologies=coords_terminologies,
-    #     coords_value_mapping=coords_value_mapping,
-    #     # filter_remove=filter_remove,
-    #     meta_data=meta_data,
-    #     convert_str=True,
-    #     time_format="%Y",
-    # )
-    #
-    # ### convert to primap2 format ###
-    # print("Converting to primap2 format.")
-    # data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_IF)
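+    # read the main inventory tables year by year, as configured in
+    # inv_conf_per_year (one table area and column set per page)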
+    df_main = None
+    for year in inv_conf_per_year.keys():
+        print("-" * 60)
+        print(f"Reading year {year}.")
+        print("-" * 60)
+        df_year = None
+        for page in inv_conf_per_year[year]["page_defs"].keys():
+            print(f"Reading table from page {page}.")
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                table_areas=inv_conf_per_year[year]["page_defs"][page]["area"],
+                columns=inv_conf_per_year[year]["page_defs"][page]["cols"],
+                flavor="stream",
+                split_text=True,
+            )
+            print("Reading complete.")
+
+            df_page = tables_inventory_original[0].df
+
+            if df_year is None:
+                df_year = df_page
+            else:
+                df_year = pd.concat(
+                    [df_year, df_page],
+                    axis=0,
+                    join="outer",
+                ).reset_index(drop=True)
+
+        print(f"Concatenating all tables for {year}.")
+
+        # fix content that spreads across multiple rows
+        if "rows_to_fix" in inv_conf_per_year[year]:
+            for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
+                print(f"Merge content for {n_rows=}")
+                df_year = fix_rows(
+                    df_year,
+                    rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
+                    col_to_use=0,
+                    n_rows=n_rows,
+                )
+
+        df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
+
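+        # drop the raw header rows of the parsed table and use the
+        # configured header and unit rows instead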
+        skip_rows = 11
+        df_year = pd.concat(
+            [df_header, df_year[skip_rows:]], axis=0, join="outer"
+        ).reset_index(drop=True)
+
+        df_year = pm2.pm2io.nir_add_unit_information(
+            df_year,
+            unit_row=inv_conf["unit_row"],
+            entity_row=inv_conf["entity_row"],
+            regexp_entity=".*",
+            regexp_unit=".*",
+            default_unit="Gg",
+        )
+
+        print("Added unit information.")
+
+        # set index
+        df_year = df_year.set_index(inv_conf["index_cols"])
+
+        # convert to long format
+        df_year_long = pm2.pm2io.nir_convert_df_to_long(
+            df_year, year, inv_conf["header_long"]
+        )
+
+        # extract from tuple
+        df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
+
+        # prep for conversion to PM2 IF and native format
+        # make a copy of the categories row
+        df_year_long["category"] = df_year_long["orig_cat_name"]
+
+        # replace cat names by codes in col "category"
+        # first the manual replacements
+
+        df_year_long["category"] = df_year_long["category"].replace(
+            inv_conf["cat_codes_manual"]
+        )
+
+        df_year_long["category"] = df_year_long["category"].str.replace(".", "")
+
+        # then the regex replacements
+        df_year_long["category"] = df_year_long["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
+
+        df_year_long = df_year_long.reset_index(drop=True)
+
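+        # remove thousands separators so the values can be parsed as numbers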
+        df_year_long["data"] = df_year_long["data"].str.replace(",", "")
+
+        # make sure all col headers are str
+        df_year_long.columns = df_year_long.columns.map(str)
+
+        df_year_long = df_year_long.drop(columns=["orig_cat_name"])
+
+        if df_main is None:
+            df_main = df_year_long
+        else:
+            df_main = pd.concat(
+                [df_main, df_year_long],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    ### convert to interchange format ###
+    print("Converting to interchange format.")
+    df_main_IF = pm2.pm2io.convert_long_dataframe_if(
+        df_main,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
+
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_IF)
 
     # ###
-    # 2.5 Read harvested wood products table
+    # 2. Read in trend tables
+    # ###
+
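+    # each trend table holds a single entity (gas), with the years spread
+    # over the columns, so the tables are read in entity by entity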
+    df_trend = None
+    for entity in inv_conf_per_entity.keys():
+        print("-" * 60)
+        print(f"Reading entity {entity}.")
+
+        df_entity = None
+
+        for page in inv_conf_per_entity[entity]["page_defs"].keys():
+            print(f"Reading page {page}.")
+
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                table_areas=inv_conf_per_entity[entity]["page_defs"][page]["area"],
+                columns=inv_conf_per_entity[entity]["page_defs"][page]["cols"],
+                flavor="stream",
+                split_text=True,
+            )
+            df_page = tables_inventory_original[0].df
+
+            if df_entity is None:
+                df_entity = df_page
+            else:
+                df_entity = pd.concat(
+                    [df_entity, df_page],
+                    axis=0,
+                    join="outer",
+                ).reset_index(drop=True)
+            print(f"Adding table from page {page}.")
+
+        if "rows_to_fix" in inv_conf_per_entity[entity]:
+            for n_rows in inv_conf_per_entity[entity]["rows_to_fix"].keys():
+                print(f"Merge content for {n_rows=}")
+                df_entity = fix_rows(
+                    df_entity,
+                    rows_to_fix=inv_conf_per_entity[entity]["rows_to_fix"][n_rows],
+                    col_to_use=0,
+                    n_rows=n_rows,
+                )
+
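+        # the first parsed row holds the column names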
+        df_entity.columns = df_entity.iloc[0, :]
+        # make a copy to avoid SettingWithCopyWarning
+        df_entity = df_entity[1:].copy()
+
+        # unit is always Gg
+        df_entity.loc[:, "unit"] = inv_conf_per_entity[entity]["unit"]
+
+        # only one entity per table
+        df_entity.loc[:, "entity"] = entity
+
+        df_entity.loc[:, "category"] = df_entity.loc[
+            :, inv_conf_per_entity[entity]["category_column"]
+        ]
+
+        if "rows_to_drop" in inv_conf_per_entity[entity]:
+            for row in inv_conf_per_entity[entity]["rows_to_drop"]:
+                row_to_delete = df_entity.index[df_entity["category"] == row][0]
+                df_entity = df_entity.drop(index=row_to_delete)
+
+        df_entity.loc[:, "category"] = df_entity.loc[:, "category"].replace(
+            inv_conf_per_entity[entity]["cat_codes_manual"]
+        )
+
+        df_entity.loc[:, "category"] = df_entity["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
+
+        df_entity = df_entity.drop(
+            columns=inv_conf_per_entity[entity]["columns_to_drop"]
+        )
+
+        for year in inv_conf_per_entity[entity]["years"]:
+            df_entity.loc[:, year] = df_entity[year].str.replace(",", "")
+
+        if df_trend is None:
+            df_trend = df_entity
+        else:
+            df_trend = pd.concat(
+                [df_trend, df_entity],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    ### convert to interchange format ###
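+    # the trend tables are already wide (one column per year), so the
+    # wide-format conversion is used here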
+    df_trend_IF = pm2.pm2io.convert_wide_dataframe_if(
+        data_wide=df_trend,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        # filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
+
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_IF)
+
+    # ###
+    # 3. Read in harvested wood products table
     # ###
 
     # The table for harvested wood products is in a different format
     # and needs to be read in separately.
 
     print("-" * 60)
-    print(
-        "Reading sector harvested wood products table."
-    )
+    print("Reading sector harvested wood products table.")
+    print("-" * 60)
 
     df_hwp = None
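+    # the table is split into several parts that are read one after the
+    # other and then concatenated horizontally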
-    for part in [*inv_conf_harvested_wood_products['parts']] :
+    for part in [*inv_conf_harvested_wood_products["parts"]]:
         tables_inventory_original = camelot.read_pdf(
             str(input_folder / pdf_file),
-            pages=inv_conf_harvested_wood_products['page'],
-            table_areas=inv_conf_harvested_wood_products['parts'][part]["page_defs"]["area"],
-            columns=inv_conf_harvested_wood_products['parts'][part]["page_defs"]["cols"],
+            pages=inv_conf_harvested_wood_products["page"],
+            table_areas=inv_conf_harvested_wood_products["parts"][part]["page_defs"][
+                "area"
+            ],
+            columns=inv_conf_harvested_wood_products["parts"][part]["page_defs"][
+                "cols"
+            ],
             flavor="stream",
             split_text=True,
         )
 
         df_hwp_part = tables_inventory_original[0].df
 
-        if "rows_to_fix" in inv_conf_harvested_wood_products['parts'][part] :
-            for n_rows in inv_conf_harvested_wood_products[
-                'parts'][part]["rows_to_fix"].keys():
+        if "rows_to_fix" in inv_conf_harvested_wood_products["parts"][part]:
+            for n_rows in inv_conf_harvested_wood_products["parts"][part][
+                "rows_to_fix"
+            ].keys():
                 df_hwp_part = fix_rows(
                     df_hwp_part,
-                    rows_to_fix=inv_conf_harvested_wood_products['parts'][part]["rows_to_fix"][n_rows],
+                    rows_to_fix=inv_conf_harvested_wood_products["parts"][part][
+                        "rows_to_fix"
+                    ][n_rows],
                     col_to_use=0,
                     n_rows=n_rows,
                 )
 
         df_hwp_part = df_hwp_part.drop(1, axis=0).reset_index(drop=True)
 
-        if df_hwp is None :
+        if df_hwp is None:
            df_hwp = df_hwp_part
-        else :
+        else:
+            # stack horizontally
            df_hwp = pd.concat(
                [df_hwp, df_hwp_part.drop(0, axis=1)],
                axis=1,
                join="outer",
            ).reset_index(drop=True)
 
-    df_hwp = pd.DataFrame(df_hwp.values[1 :], columns=df_hwp.iloc[0])
+    # assign the years to the columns
+    df_hwp = pd.DataFrame(df_hwp.values[1:], columns=df_hwp.iloc[0])
 
     df_hwp = df_hwp.rename(
-        columns={inv_conf_harvested_wood_products["category_column"] : "category"}
+        columns={inv_conf_harvested_wood_products["category_column"]: "category"}
     )
 
     df_hwp.loc[:, "category"] = df_hwp.loc[:, "category"].replace(
@@ -344,12 +360,12 @@ if __name__ == "__main__" :
     df_hwp.loc[:, "entity"] = inv_conf_harvested_wood_products["entity"]
 
     # ###
-    # 3. Read in aggregated tables from 1990 - 2020
+    # 4. Read in aggregated tables from 1990 - 2020
     # ###
 
     df_agg = None
-    # TODO remove `reversed` (only for development)
-    for sector in list(reversed(list(inv_conf_per_sector.keys()))) :
+
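+    # loop over the per-sector tables that cover the full 1990-2020 period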
+    for sector in list(inv_conf_per_sector.keys()):
         print("-" * 60)
         print(
             f"Reading sector {sector} on page(s) {[*inv_conf_per_sector[sector]['page_defs']]}."
@@ -357,7 +373,7 @@ if __name__ == "__main__" :
 
         df_sector = None
 
-        for page in [*inv_conf_per_sector[sector]["page_defs"]] :
+        for page in [*inv_conf_per_sector[sector]["page_defs"]]:
             tables_inventory_original = camelot.read_pdf(
                 str(input_folder / pdf_file),
                 pages=page,
@@ -369,9 +385,9 @@ if __name__ == "__main__" :
 
             df_sector_page = tables_inventory_original[0].df
 
-            if df_sector is None :
+            if df_sector is None:
                 df_sector = df_sector_page
-            else :
+            else:
                 df_sector = pd.concat(
                     [df_sector, df_sector_page],
                     axis=0,
@@ -384,13 +400,13 @@ if __name__ == "__main__" :
 
         df_sector = df_sector[0 : last_row + 1]
 
-        if "rows_to_fix" in inv_conf_per_sector[sector] :
-            for n_rows in inv_conf_per_sector[sector]["rows_to_fix"].keys() :
+        if "rows_to_fix" in inv_conf_per_sector[sector]:
+            for n_rows in inv_conf_per_sector[sector]["rows_to_fix"].keys():
                 print(f"Merge content for {n_rows=}")
                 # set the row
-                if "col_to_use" in inv_conf_per_sector[sector].keys() :
+                if "col_to_use" in inv_conf_per_sector[sector].keys():
                     col_to_use = inv_conf_per_sector[sector]["col_to_use"]
-                else :
+                else:
                     col_to_use = 0
                 df_sector = fix_rows(
                     df_sector,
@@ -401,8 +417,8 @@ if __name__ == "__main__" :
 
         df_sector = df_sector.reset_index(drop=True)
 
-        if "rows_to_drop" in inv_conf_per_sector[sector] :
-            for row in inv_conf_per_sector[sector]["rows_to_drop"] :
+        if "rows_to_drop" in inv_conf_per_sector[sector]:
+            for row in inv_conf_per_sector[sector]["rows_to_drop"]:
                 df_sector = df_sector.drop(index=row)
 
         # TODO: Is it necessary to set the index here?
@@ -412,14 +428,14 @@ if __name__ == "__main__" :
         df_sector = df_sector.T
 
         df_sector = df_sector.rename(
-            columns={inv_conf_per_sector[sector]["year_column"] : "category"}
+            columns={inv_conf_per_sector[sector]["year_column"]: "category"}
         )
 
         df_sector["category"] = df_sector["category"].str.replace("\n", "")
 
         # TODO Is it not the same as remove categories further down?
-        if "categories_to_drop" in inv_conf_per_sector[sector] :
-            for row in inv_conf_per_sector[sector]["categories_to_drop"] :
+        if "categories_to_drop" in inv_conf_per_sector[sector]:
+            for row in inv_conf_per_sector[sector]["categories_to_drop"]:
                 row_to_delete = df_sector.index[df_sector["category"] == row][0]
                 df_sector = df_sector.drop(index=row_to_delete)
@@ -427,12 +443,12 @@ if __name__ == "__main__" :
             inv_conf_per_sector[sector]["cat_codes_manual"]
         )
 
-        if "multi_entity" in inv_conf_per_sector[sector] :
+        if "multi_entity" in inv_conf_per_sector[sector]:
             df_sector["entity"] = inv_conf_per_sector[sector]["multi_entity"]["entity"]
             df_sector["unit"] = inv_conf_per_sector[sector]["multi_entity"]["unit"]
             # df_sector = df_sector.set_index(["entity", "unit", "category"])
 
-        else :
+        else:
             # unit is always the same
             df_sector.loc[:, "unit"] = inv_conf_per_sector[sector]["unit"]
@@ -441,25 +457,23 @@ if __name__ == "__main__" :
 
         # Some categories are in two tables (summary and sector)
         # Duplicates need to be removed
-        if "remove_duplicates" in inv_conf_per_sector[sector] :
-            for row in inv_conf_per_sector[sector]["remove_duplicates"] :
+        if "remove_duplicates" in inv_conf_per_sector[sector]:
+            for row in inv_conf_per_sector[sector]["remove_duplicates"]:
                 row_to_delete = df_sector.index[df_sector["category"] == row][0]
                 df_sector = df_sector.drop(index=row_to_delete)
 
-        if df_agg is None :
+        if df_agg is None:
             df_agg = df_sector
-        else :
+        else:
             df_agg = pd.concat(
                 [df_agg, df_sector],
                 axis=0,
                 join="outer",
             ).reset_index(drop=True)
 
-    for year in [str(y) for y in range(1990, 2021)] :
+    for year in [str(y) for y in range(1990, 2021)]:
         df_agg.loc[:, year] = df_agg[year].str.replace(",", "")
 
-    # print(df_agg)
-
     # add harvested wood products table and all the other sectors together
     df_agg = pd.concat(
         [df_agg, df_hwp],
@@ -484,65 +498,78 @@ if __name__ == "__main__" :
     print("Converting to primap2 format.")
     data_agg_pm2 = pm2.pm2io.from_interchange_format(df_agg_IF)
 
-    pass
     # # ###
-    # # Merge main and trend tables.
+    # # Merge tables.
     # # ###
-    #
-    # print("Merging main and trend table.")
-    # data_pm2 = data_main_pm2.pr.merge(data_trend_pm2, tolerance=1)
-    #
+
+    print("Merging main and trend table.")
+    data_pm2 = data_main_pm2.pr.merge(data_trend_pm2, tolerance=1)
+
+    print("Merging sector tables.")
+    data_pm2 = data_pm2.pr.merge(data_agg_pm2, tolerance=1)
+
     # # ###
     # # Save raw data to IF and native format.
     # # ###
-    #
-    # data_if = data_pm2.pr.to_interchange_format()
-    #
-    # pm2.pm2io.write_interchange_format(
-    #     output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-    #     data_if,
-    # )
-    #
-    # encoding = {var: compression for var in data_pm2.data_vars}
-    # data_pm2.pr.to_netcdf(
-    #     output_folder
-    #     / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-    #     encoding=encoding,
-    # )
-    #
+
+    data_if = data_pm2.pr.to_interchange_format()
+
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
+
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
+
     # # ###
     # # Processing
     # # ###
-    #
-    # data_proc_pm2 = process_data_for_country(
-    #     data_country=data_pm2,
-    #     entities_to_ignore=[],
-    #     gas_baskets=gas_baskets,
-    #     filter_dims=None,
-    #     cat_terminology_out=None,
-    #     category_conversion=None,
-    #     sectors_out=None,
-    #     processing_info_country=country_processing_step1,
-    # )
-    #
+
+    # create the gas baskets before aggregating the categories
+    data_proc_pm2_gas_baskets = process_data_for_country(
+        data_country=data_pm2,
+        entities_to_ignore=[],
+        gas_baskets=gas_baskets,
+        filter_dims=None,
+        cat_terminology_out=None,
+        category_conversion=None,
+        sectors_out=None,
+        processing_info_country=None,
+    )
+
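+    # then aggregate the categories as configured in country_processing_step1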
+    data_proc_pm2 = process_data_for_country(
+        data_country=data_proc_pm2_gas_baskets,
+        entities_to_ignore=[],
+        gas_baskets=None,
+        filter_dims=None,
+        cat_terminology_out=None,
+        category_conversion=None,
+        sectors_out=None,
+        processing_info_country=country_processing_step1,
+    )
+
     # # ###
     # # save processed data to IF and native format
     # # ###
-    #
-    # terminology_proc = coords_terminologies["category"]
-    #
-    # data_proc_if = data_proc_pm2.pr.to_interchange_format()
-    #
-    # if not output_folder.exists():
-    #     output_folder.mkdir()
-    # pm2.pm2io.write_interchange_format(
-    #     output_folder / (output_filename + terminology_proc), data_proc_if
-    # )
-    #
-    # encoding = {var: compression for var in data_proc_pm2.data_vars}
-    # data_proc_pm2.pr.to_netcdf(
-    #     output_folder / (output_filename + terminology_proc + ".nc"),
-    #     encoding=encoding
-    # )
-    #
-    # print("Saved processed data.")
+
+    terminology_proc = coords_terminologies["category"]
+
+    data_proc_if = data_proc_pm2.pr.to_interchange_format()
+
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
+
+    encoding = {var: compression for var in data_proc_pm2.data_vars}
+    data_proc_pm2.pr.to_netcdf(
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )
+
+    print("Saved processed data.")