
2013 overview table

Daniel Busch 9 months ago
parent
commit
bdec734a9e

+ 25 - 0
src/unfccc_ghg_data/unfccc_reader/Bangladesh/config_bgd_bur1.py

@@ -0,0 +1,25 @@
+"""
+Configuration file to read Bangladesh's BUR 1.
+"""
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+inv_conf_per_year = {
+    "2013": {
+        "page_defs": {
+            "207": {
+                "area": ["60,630,534,79"],
+                "cols": ["387,444,495"],
+                "skip_rows": 0,
+            },
+            "208": {
+                "area": ["65,687,527,120"],
+                "cols": ["380,437,491"],
+                "skip_rows": 5,
+            },
+        },
+    },
+}
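
The `area` values are camelot table regions in PDF points, given as "x1,y1,x2,y2" from the top-left to the bottom-right corner of the table, and `cols` lists the x-coordinates of the column separators. A quick way to verify such coordinates is camelot's built-in plotting; a minimal sketch, not part of this commit, assuming the PDF sits in the current directory:

import camelot

# Read page 207 with the configured table region and column separators.
tables = camelot.read_pdf(
    "Updated_BUR1_Report_15_11_2023.pdf",
    pages="207",
    table_areas=["60,630,534,79"],
    columns=["387,444,495"],
    flavor="stream",
)

# Overlay the detected text on the page to check the boundaries
# (camelot.plot returns a matplotlib figure).
camelot.plot(tables[0], kind="text").show()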

+ 143 - 1
src/unfccc_ghg_data/unfccc_reader/Bangladesh/read_BGD_BUR1_from_pdf.py

@@ -1,3 +1,145 @@
 """
-Read Burundi's BUR1 from pdf
+Read Bangladesh's BUR1 from pdf
 """
+
+import camelot
+import pandas as pd
+from config_bgd_bur1 import coords_terminologies, inv_conf_per_year
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+)
+
+if __name__ == "__main__":
+    # ###
+    # configuration
+    # ###
+
+    input_folder = downloaded_data_path / "UNFCCC" / "Bangladesh" / "BUR1"
+    output_folder = extracted_data_path / "UNFCCC" / "Bangladesh"
+
+    if not output_folder.exists():
+        output_folder.mkdir()
+
+    pdf_file = "Updated_BUR1_Report_15_11_2023.pdf"
+    output_filename = "BGD_BUR1_2023_"
+    category_column = f"category ({coords_terminologies['category']})"
+    compression = dict(zlib=True, complevel=9)
+
+    # ###
+    # 1. Read in main tables from the Annex
+    # ###
+
+    df_year = None
+    for year in inv_conf_per_year.keys():
+        print("-" * 60)
+        print(f"Reading year {year}.")
+        print("-" * 60)
+        df_year = None
+        for page in inv_conf_per_year[year]["page_defs"].keys():
+            print(f"Reading table from page {page}.")
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                table_areas=inv_conf_per_year[year]["page_defs"][page]["area"],
+                columns=inv_conf_per_year[year]["page_defs"][page]["cols"],
+                flavor="stream",
+                split_text=False,
+            )
+            print("Reading complete.")
+
+            df_page = tables_inventory_original[0].df
+
+            if df_year is None:
+                df_year = df_page[
+                    inv_conf_per_year[year]["page_defs"][page]["skip_rows"] :
+                ]
+            else:
+                df_year = pd.concat(
+                    [
+                        df_year,
+                        df_page[
+                            inv_conf_per_year[year]["page_defs"][page]["skip_rows"] :
+                        ],
+                    ],
+                    axis=0,
+                    join="outer",
+                ).reset_index(drop=True)
+
+        # TODO: the remaining processing steps below are work in progress
+        # # fix content that spreads across multiple rows
+        # if "rows_to_fix" in inv_conf_per_year[year]:
+        #     for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
+        #         print(f"Merge content for {n_rows=}")
+        #         df_year = fix_rows(
+        #             df_year,
+        #             rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
+        #             col_to_use=0,
+        #             n_rows=n_rows,
+        #         )
+        #
+        # df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
+        #
+        # skip_rows = 11
+        # df_year = pd.concat(
+        #     [df_header, df_year[skip_rows:]], axis=0, join="outer"
+        # ).reset_index(drop=True)
+        #
+        # df_year = pm2.pm2io.nir_add_unit_information(
+        #     df_year,
+        #     unit_row=inv_conf["unit_row"],
+        #     entity_row=inv_conf["entity_row"],
+        #     regexp_entity=".*",
+        #     regexp_unit=".*",
+        #     default_unit="Gg",
+        # )
+        #
+        # print("Added unit information.")
+        #
+        # # set index
+        # df_year = df_year.set_index(inv_conf["index_cols"])
+        #
+        # # convert to long format
+        # df_year_long = pm2.pm2io.nir_convert_df_to_long(
+        #     df_year, year, inv_conf["header_long"]
+        # )
+        #
+        # # extract from tuple
+        # df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
+        #
+        # # prep for conversion to PM2 IF and native format
+        # # make a copy of the categories row
+        # df_year_long["category"] = df_year_long["orig_cat_name"]
+        #
+        # # replace cat names by codes in col "category"
+        # # first the manual replacements
+        #
+        # df_year_long["category"] = df_year_long["category"].replace(
+        #     inv_conf["cat_codes_manual"]
+        # )
+        #
+        # df_year_long["category"] = df_year_long["category"].str.replace(".", "")
+        #
+        # # then the regex replacements
+        # df_year_long["category"] = df_year_long["category"].str.replace(
+        #     inv_conf["cat_code_regexp"], repl, regex=True
+        # )
+        #
+        # df_year_long = df_year_long.reset_index(drop=True)
+        #
+        # df_year_long["data"] = df_year_long["data"].str.replace(",", "")
+        #
+        # # make sure all col headers are str
+        # df_year_long.columns = df_year_long.columns.map(str)
+        #
+        # df_year_long = df_year_long.drop(columns=["orig_cat_name"])
+        #
+        # if df_main is None:
+        #     df_main = df_year_long
+        # else:
+        #     df_main = pd.concat(
+        #         [df_main, df_year_long],
+        #         axis=0,
+        #         join="outer",
+        #     ).reset_index(drop=True)
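
The commented-out pipeline references a `cat_code_regexp` pattern and a `repl` callable that this commit does not define yet. In sibling readers of this package the regex typically captures the leading IPCC category code from the original category name, with `repl` returning the captured group; a minimal sketch under that assumption (both names are hypothetical until they land in config_bgd_bur1):

import re

import pandas as pd

# Hypothetical config value: capture the leading category code,
# e.g. "1.A.2" from "1.A.2 Manufacturing Industries".
cat_code_regexp = r"^(?P<code>[0-9A-Za-z.]+)\s.*"


def repl(match: re.Match) -> str:
    """Reduce a full category name to its leading code."""
    return match.group("code")


categories = pd.Series(["1.A.2 Manufacturing Industries", "2 IPPU"])
print(categories.str.replace(cat_code_regexp, repl, regex=True))
# 0    1.A.2
# 1        2
# dtype: object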