@@ -1,29 +1,23 @@
"""
Read Mongolia's BUR2 from pdf
"""
-import camelot
-import pandas as pd
-import primap2 as pm2
+# TODO: Delete when this is finished
+import os

-from unfccc_ghg_data.helper import (
+os.environ[
+    "UNFCCC_GHG_ROOT_PATH"
+] = "/Users/danielbusch/Documents/UNFCCC_non-AnnexI_data/"
+
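+# The root path must be set before the package imports below (hence their
+# "# noqa: E402"); the hard-coded path is only a temporary aid for local
+# debugging, per the TODO above.
+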
+import camelot  # noqa: E402
+import pandas as pd  # noqa: E402
+from config_mng_bur2 import (  # noqa: E402
+    coords_terminologies,
+)
+
+from unfccc_ghg_data.helper import (  # noqa: E402
    downloaded_data_path,
    extracted_data_path,
    fix_rows,
-    process_data_for_country,
-)
-
-from .config_mng_bur2 import (
-    coords_cols,
-    coords_defaults,
-    coords_terminologies,
-    coords_value_mapping,
-    country_processing_step1,
-    filter_remove,
-    gas_baskets,
-    inv_conf,
-    inv_conf_per_entity,
-    inv_conf_per_year,
-    meta_data,
-)
)

if __name__ == "__main__":
@@ -49,242 +43,243 @@ if __name__ == "__main__":
    # 1. Read in main tables
    # ###

-    df_main = None
-    for year in inv_conf_per_year.keys():
-        print("-" * 60)
-        print(f"Reading year {year}.")
-        print("-" * 60)
-        df_year = None
-        for page in inv_conf_per_year[year]["page_defs"].keys():
-            print(f"Reading table from page {page}.")
-            tables_inventory_original = camelot.read_pdf(
-                str(input_folder / pdf_file),
-                pages=page,
-                table_areas=inv_conf_per_year[year]["page_defs"][page]["area"],
-                columns=inv_conf_per_year[year]["page_defs"][page]["cols"],
-                flavor="stream",
-                split_text=True,
-            )
-            print("Reading complete.")
-
-            df_page = tables_inventory_original[0].df
-
-            if df_year is None:
-                df_year = df_page
-            else:
-                df_year = pd.concat(
-                    [df_year, df_page],
-                    axis=0,
-                    join="outer",
-                ).reset_index(drop=True)
-
-        print(f"Concatenating all tables for {year}.")
-
-        # fix content that spreads across multiple rows
-        if "rows_to_fix" in inv_conf_per_year[year]:
-            for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
-                print(f"Merge content for {n_rows=}")
-                df_year = fix_rows(
-                    df_year,
-                    rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
-                    col_to_use=0,
-                    n_rows=n_rows,
-                )
-
-        df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
-
-        skip_rows = 11
-        df_year = pd.concat(
-            [df_header, df_year[skip_rows:]], axis=0, join="outer"
-        ).reset_index(drop=True)
-
-        df_year = pm2.pm2io.nir_add_unit_information(
-            df_year,
-            unit_row=inv_conf["unit_row"],
-            entity_row=inv_conf["entity_row"],
-            regexp_entity=".*",
-            regexp_unit=".*",
-            default_unit="Gg",
-        )
-
-        print("Added unit information.")
-
-        # set index
-        df_year = df_year.set_index(inv_conf["index_cols"])
-
-        # convert to long format
-        df_year_long = pm2.pm2io.nir_convert_df_to_long(
-            df_year, year, inv_conf["header_long"]
-        )
-
-        # extract from tuple
-        df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
-
-        # prep for conversion to PM2 IF and native format
-        # make a copy of the categories row
-        df_year_long["category"] = df_year_long["orig_cat_name"]
-
-        # replace cat names by codes in col "category"
-        # first the manual replacements
-
-        df_year_long["category"] = df_year_long["category"].replace(
-            inv_conf["cat_codes_manual"]
-        )
-
-        df_year_long["category"] = df_year_long["category"].str.replace(".", "")
-
-        # then the regex replacements
-        df_year_long["category"] = df_year_long["category"].str.replace(
-            inv_conf["cat_code_regexp"], repl, regex=True
-        )
-
-        df_year_long = df_year_long.reset_index(drop=True)
-
-        df_year_long["data"] = df_year_long["data"].str.replace(",", "")
-
-        # make sure all col headers are str
-        df_year_long.columns = df_year_long.columns.map(str)
-
-        df_year_long = df_year_long.drop(columns=["orig_cat_name"])
-
-        if df_main is None:
-            df_main = df_year_long
-        else:
-            df_main = pd.concat(
-                [df_main, df_year_long],
-                axis=0,
-                join="outer",
-            ).reset_index(drop=True)
-
-    ### convert to interchange format ###
-    print("Converting to interchange format.")
-    df_main_IF = pm2.pm2io.convert_long_dataframe_if(
-        df_main,
-        coords_cols=coords_cols,
-        coords_defaults=coords_defaults,
-        coords_terminologies=coords_terminologies,
-        coords_value_mapping=coords_value_mapping,
-        filter_remove=filter_remove,
-        meta_data=meta_data,
-        convert_str=True,
-        time_format="%Y",
-    )
-
-    ### convert to primap2 format ###
-    print("Converting to primap2 format.")
-    data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_IF)
-
-    # ###
-    # 2. Read in trend tables
-    # ###
-
-    df_trend = None
-    for entity in inv_conf_per_entity.keys():
-        print("-" * 60)
-        print(f"Reading entity {entity}.")
-
-        df_entity = None
-
-        for page in inv_conf_per_entity[entity]["page_defs"].keys():
-            print(f"Reading page {page}.")
-
-            tables_inventory_original = camelot.read_pdf(
-                str(input_folder / pdf_file),
-                pages=page,
-                table_areas=inv_conf_per_entity[entity]["page_defs"][page]["area"],
-                columns=inv_conf_per_entity[entity]["page_defs"][page]["cols"],
-                flavor="stream",
-                split_text=True,
-            )
-            df_page = tables_inventory_original[0].df
-
-            if df_entity is None:
-                df_entity = df_page
-            else:
-                df_entity = pd.concat(
-                    [df_entity, df_page],
-                    axis=0,
-                    join="outer",
-                ).reset_index(drop=True)
-            print(f"adding table from page {page}.")
-
-        if "rows_to_fix" in inv_conf_per_entity[entity]:
-            for n_rows in inv_conf_per_entity[entity]["rows_to_fix"].keys():
-                print(f"Merge content for {n_rows=}")
-                df_entity = fix_rows(
-                    df_entity,
-                    rows_to_fix=inv_conf_per_entity[entity]["rows_to_fix"][n_rows],
-                    col_to_use=0,
-                    n_rows=n_rows,
-                )
-
-        df_entity.columns = df_entity.iloc[0, :]
-        df_entity = df_entity[1:]
-
-        # unit is always Gg
-        df_entity.loc[:, "unit"] = inv_conf_per_entity[entity]["unit"]
-
-        # only one entity per table
-        df_entity.loc[:, "entity"] = entity
-
-        # TODO: Fix pandas "set value on slice of copy" warning
-        df_entity.loc[:, "category"] = df_entity.loc[
-            :, inv_conf_per_entity[entity]["category_column"]
-        ]
-
-        if "rows_to_drop" in inv_conf_per_entity[entity]:
-            for row in inv_conf_per_entity[entity]["rows_to_drop"]:
-                row_to_delete = df_entity.index[df_entity["category"] == row][0]
-                df_entity = df_entity.drop(index=row_to_delete)
-
-        df_entity.loc[:, "category"] = df_entity.loc[:, "category"].replace(
-            inv_conf_per_entity[entity]["cat_codes_manual"]
-        )
-
-        df_entity.loc[:, "category"] = df_entity["category"].str.replace(
-            inv_conf["cat_code_regexp"], repl, regex=True
-        )
-
-        df_entity = df_entity.drop(
-            columns=inv_conf_per_entity[entity]["columns_to_drop"]
-        )
-
-        for year in inv_conf_per_entity[entity]["years"]:
-            df_entity.loc[:, year] = df_entity[year].str.replace(",", "")
-
-        if df_trend is None:
-            df_trend = df_entity
-        else:
-            df_trend = pd.concat(
-                [df_trend, df_entity],
-                axis=0,
-                join="outer",
-            ).reset_index(drop=True)
-
-    ### convert to interchange format ###
-    df_trend_IF = pm2.pm2io.convert_wide_dataframe_if(
-        data_wide=df_trend,
-        coords_cols=coords_cols,
-        coords_defaults=coords_defaults,
-        coords_terminologies=coords_terminologies,
-        coords_value_mapping=coords_value_mapping,
-        # filter_remove=filter_remove,
-        meta_data=meta_data,
-        convert_str=True,
-        time_format="%Y",
-    )
-
-    ### convert to primap2 format ###
-    print("Converting to primap2 format.")
-    data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_IF)
+    # df_main = None
+    # for year in inv_conf_per_year.keys():
+    #     print("-" * 60)
+    #     print(f"Reading year {year}.")
+    #     print("-" * 60)
+    #     df_year = None
+    #     for page in inv_conf_per_year[year]["page_defs"].keys():
+    #         print(f"Reading table from page {page}.")
+    #         tables_inventory_original = camelot.read_pdf(
+    #             str(input_folder / pdf_file),
+    #             pages=page,
+    #             table_areas=inv_conf_per_year[year]["page_defs"][page]["area"],
+    #             columns=inv_conf_per_year[year]["page_defs"][page]["cols"],
+    #             flavor="stream",
+    #             split_text=True,
+    #         )
+    #         print("Reading complete.")
+    #
+    #         df_page = tables_inventory_original[0].df
+    #
+    #         if df_year is None:
+    #             df_year = df_page
+    #         else:
+    #             df_year = pd.concat(
+    #                 [df_year, df_page],
+    #                 axis=0,
+    #                 join="outer",
+    #             ).reset_index(drop=True)
+    #
+    #     print(f"Concatenating all tables for {year}.")
+    #
+    #     # fix content that spreads across multiple rows
+    #     if "rows_to_fix" in inv_conf_per_year[year]:
+    #         for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
+    #             print(f"Merge content for {n_rows=}")
+    #             df_year = fix_rows(
+    #                 df_year,
+    #                 rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
+    #                 col_to_use=0,
+    #                 n_rows=n_rows,
+    #             )
+    #
+    #     df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
+    #
+    #     skip_rows = 11
+    #     df_year = pd.concat(
+    #         [df_header, df_year[skip_rows:]], axis=0, join="outer"
+    #     ).reset_index(drop=True)
+    #
+    #     df_year = pm2.pm2io.nir_add_unit_information(
+    #         df_year,
+    #         unit_row=inv_conf["unit_row"],
+    #         entity_row=inv_conf["entity_row"],
+    #         regexp_entity=".*",
+    #         regexp_unit=".*",
+    #         default_unit="Gg",
+    #     )
+    #
+    #     print("Added unit information.")
+    #
+    #     # set index
+    #     df_year = df_year.set_index(inv_conf["index_cols"])
+    #
+    #     # convert to long format
+    #     df_year_long = pm2.pm2io.nir_convert_df_to_long(
+    #         df_year, year, inv_conf["header_long"]
+    #     )
+    #
+    #     # extract from tuple
+    #     df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
+    #
+    #     # prep for conversion to PM2 IF and native format
+    #     # make a copy of the categories row
+    #     df_year_long["category"] = df_year_long["orig_cat_name"]
+    #
+    #     # replace cat names by codes in col "category"
+    #     # first the manual replacements
+    #
+    #     df_year_long["category"] = df_year_long["category"].replace(
+    #         inv_conf["cat_codes_manual"]
+    #     )
+    #
+    #     df_year_long["category"] = df_year_long["category"].str.replace(".", "")
+    #
+    #     # then the regex replacements
+    #     df_year_long["category"] = df_year_long["category"].str.replace(
+    #         inv_conf["cat_code_regexp"], repl, regex=True
+    #     )
+    #
+    #     df_year_long = df_year_long.reset_index(drop=True)
+    #
+    #     df_year_long["data"] = df_year_long["data"].str.replace(",", "")
+    #
+    #     # make sure all col headers are str
+    #     df_year_long.columns = df_year_long.columns.map(str)
+    #
+    #     df_year_long = df_year_long.drop(columns=["orig_cat_name"])
+    #
+    #     if df_main is None:
+    #         df_main = df_year_long
+    #     else:
+    #         df_main = pd.concat(
+    #             [df_main, df_year_long],
+    #             axis=0,
+    #             join="outer",
+    #         ).reset_index(drop=True)
+    #
+    # ### convert to interchange format ###
+    # print("Converting to interchange format.")
+    # df_main_IF = pm2.pm2io.convert_long_dataframe_if(
+    #     df_main,
+    #     coords_cols=coords_cols,
+    #     coords_defaults=coords_defaults,
+    #     coords_terminologies=coords_terminologies,
+    #     coords_value_mapping=coords_value_mapping,
+    #     filter_remove=filter_remove,
+    #     meta_data=meta_data,
+    #     convert_str=True,
+    #     time_format="%Y",
+    # )
+    #
+    # ### convert to primap2 format ###
+    # print("Converting to primap2 format.")
+    # data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_IF)
+    #
+    # # ###
+    # # 2. Read in trend tables
+    # # ###
+    #
+    # df_trend = None
+    # for entity in inv_conf_per_entity.keys():
+    #     print("-" * 60)
+    #     print(f"Reading entity {entity}.")
+    #
+    #     df_entity = None
+    #
+    #     for page in inv_conf_per_entity[entity]["page_defs"].keys():
+    #         print(f"Reading page {page}.")
+    #
+    #         tables_inventory_original = camelot.read_pdf(
+    #             str(input_folder / pdf_file),
+    #             pages=page,
+    #             table_areas=inv_conf_per_entity[entity]["page_defs"][page]["area"],
+    #             columns=inv_conf_per_entity[entity]["page_defs"][page]["cols"],
+    #             flavor="stream",
+    #             split_text=True,
+    #         )
+    #         df_page = tables_inventory_original[0].df
+    #
+    #         if df_entity is None:
+    #             df_entity = df_page
+    #         else:
+    #             df_entity = pd.concat(
+    #                 [df_entity, df_page],
+    #                 axis=0,
+    #                 join="outer",
+    #             ).reset_index(drop=True)
+    #         print(f"adding table from page {page}.")
+    #
+    #     if "rows_to_fix" in inv_conf_per_entity[entity]:
+    #         for n_rows in inv_conf_per_entity[entity]["rows_to_fix"].keys():
+    #             print(f"Merge content for {n_rows=}")
+    #             df_entity = fix_rows(
+    #                 df_entity,
+    #                 rows_to_fix=inv_conf_per_entity[entity]["rows_to_fix"][n_rows],
+    #                 col_to_use=0,
+    #                 n_rows=n_rows,
+    #             )
+    #
+    #     df_entity.columns = df_entity.iloc[0, :]
+    #     df_entity = df_entity[1:]
+    #
+    #     # unit is always Gg
+    #     df_entity.loc[:, "unit"] = inv_conf_per_entity[entity]["unit"]
+    #
+    #     # only one entity per table
+    #     df_entity.loc[:, "entity"] = entity
+    #
+    #     # TODO: Fix pandas "set value on slice of copy" warning
+    #     df_entity.loc[:, "category"] = df_entity.loc[
+    #         :, inv_conf_per_entity[entity]["category_column"]
+    #     ]
+    #
+    #     if "rows_to_drop" in inv_conf_per_entity[entity]:
+    #         for row in inv_conf_per_entity[entity]["rows_to_drop"]:
+    #             row_to_delete = df_entity.index[df_entity["category"] == row][0]
+    #             df_entity = df_entity.drop(index=row_to_delete)
+    #
+    #     df_entity.loc[:, "category"] = df_entity.loc[:, "category"].replace(
+    #         inv_conf_per_entity[entity]["cat_codes_manual"]
+    #     )
+    #
+    #     df_entity.loc[:, "category"] = df_entity["category"].str.replace(
+    #         inv_conf["cat_code_regexp"], repl, regex=True
+    #     )
+    #
+    #     df_entity = df_entity.drop(
+    #         columns=inv_conf_per_entity[entity]["columns_to_drop"]
+    #     )
+    #
+    #     for year in inv_conf_per_entity[entity]["years"]:
+    #         df_entity.loc[:, year] = df_entity[year].str.replace(",", "")
+    #
+    #     if df_trend is None:
+    #         df_trend = df_entity
+    #     else:
+    #         df_trend = pd.concat(
+    #             [df_trend, df_entity],
+    #             axis=0,
+    #             join="outer",
+    #         ).reset_index(drop=True)
+    #
+    # ### convert to interchange format ###
+    # df_trend_IF = pm2.pm2io.convert_wide_dataframe_if(
+    #     data_wide=df_trend,
+    #     coords_cols=coords_cols,
+    #     coords_defaults=coords_defaults,
+    #     coords_terminologies=coords_terminologies,
+    #     coords_value_mapping=coords_value_mapping,
+    #     # filter_remove=filter_remove,
+    #     meta_data=meta_data,
+    #     convert_str=True,
+    #     time_format="%Y",
+    # )
+    #
+    # ### convert to primap2 format ###
+    # print("Converting to primap2 format.")
+    # data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_IF)

    # ###
    # 3. Read in aggregated tables from 1990 - 2020
    # ###

    # Work in progress
+    # noinspection PyInterpreter
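+    # Configuration per sector table: "page_defs" gives the camelot table
+    # areas and column separators per PDF page, "rows_to_fix" lists first-
+    # column labels whose content is spread over n rows and must be merged,
+    # and "cat_codes_manual" maps the (partly garbled) category labels as
+    # extracted from the PDF to IPCC category codes.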
    inv_conf_per_sector = {
-        "all": {
+        "total": {
            "page_defs": {
                "32": {
                    "area": ["64,649,547,106"],
@@ -292,9 +287,9 @@ if __name__ == "__main__":
                },
            },
            "entity": "KYOTOGHG (SARGWP100)",
-            "category_column": "Categories",
-            "columns_to_drop": ["Categories"],
-            "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
+            # "category_column": "Categories",
+            # "columns_to_drop": ["Categories"],
+            # "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
            "unit": "Gg CO2e",
            "last_year": "2020",
            "rows_to_fix": {
@@ -302,6 +297,8 @@ if __name__ == "__main__":
                    "Year",
                ],
            },
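+            # header exactly as extracted from the PDF, including stray spaces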
+            "year_column": " Year ",
+            # TODO: some categories are not recognized!
            "cat_codes_manual": {
                "Energy": "1",
                "IPPU": "2",
@@ -312,110 +309,186 @@ if __name__ == "__main__":
                "Total (incl. LULUCF)": "M.0",
            },
        },
+        "energy": {
+            "page_defs": {
+                "43": {
+                    "area": ["59,478,544,79"],
+                    "cols": ["97,160,220,262,338,388,452,502"],
+                },
+                "44": {
+                    "area": ["60,773,546,582"],
+                    "cols": ["103,165,226,274,329,384,444,494"],
+                },
+            },
+            "entity": "KYOTOGHG (SARGWP100)",
+            # "category_column" : "Categories",
+            # "columns_to_drop" : ["Categories"],
+            # "years" : ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
+            "unit": "Gg CO2e",
+            "last_year": "2020",
+            "rows_to_fix": {
+                11: [
+                    "Years",
+                ],
+            },
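+            # positional indices of leftover header rows to drop after the
+            # multi-row merge above (assumption: rows 0 and 2 are header
+            # fragments, not data)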
+            "rows_to_drop": [0, 2],
+            "year_column": "Years ",
+            "cat_codes_manual": {
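+                # keys are deliberately the garbled strings that camelot
+                # extracts from the PDF, so that replace() matches them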
+                r" 1.A.1.a.i Electricity generation ": "1.A.1.a.i",
+                r" 1.A.1.a.ii Combined heat and ipower peneration (CHP)": "1.A.1.a.ii",  # noqa: E501
+                r" 1.A.1.c.ii Other energy ndustries ": "1.A.1.c.ii",
+                r"Manufacturing industries and construction ": "1.A.2",
+                r" 1.A.3.a 1 Civil aviation t ": "1.A.3.a",
+                r" .A.3.b Road ransportation ": "1.A.3.b",
+                r" 1.A.3.c Railways ": "1.A.3.c",
+                r" 1.A.3.e.ii Off-road ": "1.A.3.e.ii",
+            },
+        },
    }

-    page = [*inv_conf_per_sector["all"]["page_defs"]][0]  # noqa: RUF015
-    sector = "all"
-
-    tables_inventory_original = camelot.read_pdf(
-        str(input_folder / pdf_file),
-        pages=page,
-        table_areas=inv_conf_per_sector[sector]["page_defs"][page]["area"],
-        columns=inv_conf_per_sector[sector]["page_defs"][page]["cols"],
-        flavor="stream",
-        split_text=True,
-    )
-
-    df_sector = tables_inventory_original[0].df
-
-    last_row = df_sector.loc[df_sector[0] == "2020"].index[0]
+    df_agg = None
+    for sector in inv_conf_per_sector.keys():
+        print("-" * 60)
+        print(f"Reading sector {sector}.")

-    df_sector = df_sector[0 : last_row + 1]
+        df_sector = None

-    if "rows_to_fix" in inv_conf_per_sector[sector]:
-        for n_rows in inv_conf_per_sector[sector]["rows_to_fix"].keys():
-            print(f"Merge content for {n_rows=}")
-            df_sector = fix_rows(
-                df_sector,
-                rows_to_fix=inv_conf_per_sector[sector]["rows_to_fix"][n_rows],
-                col_to_use=0,
-                n_rows=n_rows,
+        for page in [*inv_conf_per_sector[sector]["page_defs"]]:
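+            # read one table per page; the "stream" flavor parses whitespace-
+            # separated tables inside the configured area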
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                table_areas=inv_conf_per_sector[sector]["page_defs"][page]["area"],
+                columns=inv_conf_per_sector[sector]["page_defs"][page]["cols"],
+                flavor="stream",
+                split_text=True,
            )

-    df_sector = df_sector.set_index(0)
-
-    # transpose so categegories are in first columns
-    df_sector = df_sector.T
-
-    df_sector = df_sector.rename(columns={" Year ": "category"})
-
-    # unit is always Gg
-    df_sector.loc[:, "unit"] = inv_conf_per_sector[sector]["unit"]
+            df_sector_page = tables_inventory_original[0].df

-    # only one entity per table
-    df_sector.loc[:, "entity"] = inv_conf_per_sector[sector]["entity"]
+            if df_sector is None:
+                df_sector = df_sector_page
+            else:
+                df_sector = pd.concat(
+                    [df_sector, df_sector_page],
+                    axis=0,
+                    join="outer",
+                ).reset_index(drop=True)

-    df_sector.loc[:, "category"] = df_sector.loc[:, "category"].replace(
-        inv_conf_per_sector[sector]["cat_codes_manual"]
-    )
+            print(f"Adding table from page {page}.")

-    # ###
-    # Merge main and trend tables.
-    # ###
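+        # rows below the last inventory year ("2020") are footnotes, so cut
+        # the frame there (assumes "2020" appears exactly once in column 0)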
+        last_row = df_sector.loc[df_sector[0] == "2020"].index[0]

-    print("Merging main and trend table.")
-    data_pm2 = data_main_pm2.pr.merge(data_trend_pm2, tolerance=1)
+        df_sector = df_sector[0 : last_row + 1]

-    # ###
-    # Save raw data to IF and native format.
-    # ###
+        if "rows_to_fix" in inv_conf_per_sector[sector]:
+            for n_rows in inv_conf_per_sector[sector]["rows_to_fix"].keys():
+                print(f"Merge content for {n_rows=}")
+                df_sector = fix_rows(
+                    df_sector,
+                    rows_to_fix=inv_conf_per_sector[sector]["rows_to_fix"][n_rows],
+                    col_to_use=0,
+                    n_rows=n_rows,
+                )

-    data_if = data_pm2.pr.to_interchange_format()
+        df_sector = df_sector.reset_index(drop=True)

-    pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_if,
-    )
+        if "rows_to_drop" in inv_conf_per_sector[sector]:
+            for row in inv_conf_per_sector[sector]["rows_to_drop"]:
+                df_sector = df_sector.drop(index=row)

-    encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(
-        output_folder
-        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding,
-    )
+        df_sector = df_sector.set_index(0)

-    # ###
-    # Processing
-    # ###
+        # transpose so categories are in the first column
+        df_sector = df_sector.T

-    data_proc_pm2 = process_data_for_country(
-        data_country=data_pm2,
-        entities_to_ignore=[],
-        gas_baskets=gas_baskets,
-        filter_dims=None,
-        cat_terminology_out=None,
-        category_conversion=None,
-        sectors_out=None,
-        processing_info_country=country_processing_step1,
-    )
+        df_sector = df_sector.rename(
+            columns={inv_conf_per_sector[sector]["year_column"]: "category"}
+        )

-    # ###
-    # save processed data to IF and native format
-    # ###
+        # unit is always Gg
+        df_sector.loc[:, "unit"] = inv_conf_per_sector[sector]["unit"]

-    terminology_proc = coords_terminologies["category"]
+        # only one entity per table
+        df_sector.loc[:, "entity"] = inv_conf_per_sector[sector]["entity"]

-    data_proc_if = data_proc_pm2.pr.to_interchange_format()
+        df_sector["category"] = df_sector["category"].str.replace("\n", "")

-    if not output_folder.exists():
-        output_folder.mkdir()
-    pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if
-    )
+        df_sector.loc[:, "category"] = df_sector.loc[:, "category"].replace(
+            inv_conf_per_sector[sector]["cat_codes_manual"]
+        )

-    encoding = {var: compression for var in data_proc_pm2.data_vars}
-    data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
-    )
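+        # collect the per-sector tables into one wide frame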
+        if df_agg is None:
+            df_agg = df_sector
+        else:
+            df_agg = pd.concat(
+                [df_agg, df_sector],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)

-    print("Saved processed data.")
+    for year in [str(y) for y in range(1990, 2021)]:
+        df_agg.loc[:, year] = df_agg[year].str.replace(",", "")
+
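+    # temporary debugging output while this section is work in progress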
+    print(df_agg)
+
+    # # ###
+    # # Merge main and trend tables.
+    # # ###
+    #
+    # print("Merging main and trend table.")
+    # data_pm2 = data_main_pm2.pr.merge(data_trend_pm2, tolerance=1)
+    #
+    # # ###
+    # # Save raw data to IF and native format.
+    # # ###
+    #
+    # data_if = data_pm2.pr.to_interchange_format()
+    #
+    # pm2.pm2io.write_interchange_format(
+    #     output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+    #     data_if,
+    # )
+    #
+    # encoding = {var: compression for var in data_pm2.data_vars}
+    # data_pm2.pr.to_netcdf(
+    #     output_folder
+    #     / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+    #     encoding=encoding,
+    # )
+    #
+    # # ###
+    # # Processing
+    # # ###
+    #
+    # data_proc_pm2 = process_data_for_country(
+    #     data_country=data_pm2,
+    #     entities_to_ignore=[],
+    #     gas_baskets=gas_baskets,
+    #     filter_dims=None,
+    #     cat_terminology_out=None,
+    #     category_conversion=None,
+    #     sectors_out=None,
+    #     processing_info_country=country_processing_step1,
+    # )
+    #
+    # # ###
+    # # save processed data to IF and native format
+    # # ###
+    #
+    # terminology_proc = coords_terminologies["category"]
+    #
+    # data_proc_if = data_proc_pm2.pr.to_interchange_format()
+    #
+    # if not output_folder.exists():
+    #     output_folder.mkdir()
+    # pm2.pm2io.write_interchange_format(
+    #     output_folder / (output_filename + terminology_proc), data_proc_if
+    # )
+    #
+    # encoding = {var: compression for var in data_proc_pm2.data_vars}
+    # data_proc_pm2.pr.to_netcdf(
+    #     output_folder / (output_filename + terminology_proc + ".nc"),
+    #     encoding=encoding
+    # )
+    #
+    # print("Saved processed data.")