
first table

Daniel Busch 9 months ago
parent
commit
8bbbb22cf4

+ 23 - 3
src/unfccc_ghg_data/unfccc_reader/Bangladesh/config_bgd_bur1.py

@@ -13,13 +13,33 @@ inv_conf_per_year = {
             "207": {
                 "area": ["60,630,534,79"],
                 "cols": ["387,444,495"],
-                "skip_rows": 0,
+                "skip_rows_start": 0,
+                "skip_rows_end": 0,
             },
             "208": {
-                "area": ["65,687,527,120"],
+                "area": ["63,720,527,120"],
                 "cols": ["380,437,491"],
-                "skip_rows": 5,
+                "skip_rows_start": 8,
+                "skip_rows_end": 6,
             },
         },
+        "rows_to_fix": {
+            -2: [
+                "ch4 emission from rice field",
+                "indirect nitrous oxide (n2o) from n based fertilizer",
+                "Direct nitrous oxide (n2o) emissions from fertilizer application",
+                "Total enteric ch4 emissions",
+                "Total Manure ch4 emissions",
+                "Total Direct n2o emissions from manure system",
+                "Total indirect n2o emissions - Volatilization",
+                "Total indirect n2o emissions - leaching/Runoff",
+            ],
+            3: [
+                "3 - GHG Emissions Agriculture, Livestock & Forest and Other Land -Use"
+            ],
+            -3: [
+                "Greenhouse gas source and sink categories",
+            ],
+        },
     },
 }
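
The per-page "area" and "cols" strings are coordinate lists for camelot's stream parser. The full read_pdf call is truncated in the reader hunk below, so the keyword names used here (table_areas, columns, flavor) are an assumption based on camelot-py's documented stream-flavor API rather than a copy of the reader code, and the PDF file name is hypothetical:

import camelot

# minimal sketch, assuming camelot-py's stream flavor
tables = camelot.read_pdf(
    "BUR1_Bangladesh.pdf",           # hypothetical file name
    pages="208",
    flavor="stream",
    table_areas=["63,720,527,120"],  # "area" value from the config above
    columns=["380,437,491"],         # "cols" value from the config above
)
df_page = tables[0].df               # first detected table as a DataFrame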

+ 28 - 17
src/unfccc_ghg_data/unfccc_reader/Bangladesh/read_BGD_BUR1_from_pdf.py

@@ -9,6 +9,7 @@ from config_bgd_bur1 import coords_terminologies, inv_conf_per_year
 from unfccc_ghg_data.helper import (
     downloaded_data_path,
     extracted_data_path,
+    fix_rows,
 )
 
 if __name__ == "__main__":
@@ -39,6 +40,8 @@ if __name__ == "__main__":
         df_year = None
         for page in inv_conf_per_year[year]["page_defs"].keys():
             print(f"Reading table from page {page}.")
+
+            # read from PDF
             tables_inventory_original = camelot.read_pdf(
                 str(input_folder / pdf_file),
                 pages=page,
@@ -51,34 +54,42 @@ if __name__ == "__main__":
 
             df_page = tables_inventory_original[0].df
 
+            # cut rows at the top if needed
+            skip_rows_start = inv_conf_per_year[year]["page_defs"][page][
+                "skip_rows_start"
+            ]
+            if not skip_rows_start == 0:
+                df_page = df_page[skip_rows_start:]
+
+            # cut rows at the bottom if needed
+            skip_rows_end = inv_conf_per_year[year]["page_defs"][page]["skip_rows_end"]
+            if not skip_rows_end == 0:
+                df_page = df_page[:-skip_rows_end]
+
+            # stack the tables vertically
             if df_year is None:
-                df_year = df_page[
-                    inv_conf_per_year[year]["page_defs"][page]["skip_rows"] :
-                ]
+                df_year = df_page
             else:
                 df_year = pd.concat(
                     [
                         df_year,
-                        df_page[
-                            inv_conf_per_year[year]["page_defs"][page]["skip_rows"] :
-                        ],
+                        df_page,
                     ],
                     axis=0,
                     join="outer",
                 ).reset_index(drop=True)
 
+        # fix content that spreads across multiple rows
+        if "rows_to_fix" in inv_conf_per_year[year]:
+            for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
+                print(f"Merge content for {n_rows=}")
+                df_year = fix_rows(
+                    df_year,
+                    rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
+                    col_to_use=0,
+                    n_rows=n_rows,
+                )
         pass
-        # # fix content that spreads across multiple rows
-        # if "rows_to_fix" in inv_conf_per_year[year]:
-        #     for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
-        #         print(f"Merge content for {n_rows=}")
-        #         df_year = fix_rows(
-        #             df_year,
-        #             rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
-        #             col_to_use=0,
-        #             n_rows=n_rows,
-        #         )
-        #
         # df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
         #
         # skip_rows = 11
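
Taken together, the new logic trims page headers and footers by row count, stacks the per-page tables, and then merges category labels that the PDF extraction split across rows. Below is a minimal, self-contained sketch of that flow using toy frames; merge_split_rows is a hypothetical stand-in for the fix_rows helper, whose exact semantics (including the meaning of negative n_rows keys) are assumed here, not shown in the diff:

import pandas as pd

# two toy "page" frames standing in for camelot output; real layout differs
page_207 = pd.DataFrame([
    ["header junk", "", ""],
    ["ch4 emission from", "", ""],
    ["rice field", "12.3", "Gg"],
    ["footer junk", "", ""],
])
page_208 = pd.DataFrame([
    ["Total enteric ch4", "", ""],
    ["emissions", "45.6", "Gg"],
])

skip_rows_start, skip_rows_end = 1, 1
page_207 = page_207[skip_rows_start:]   # cut rows at the top
page_207 = page_207[:-skip_rows_end]    # cut rows at the bottom

df_year = pd.concat(
    [page_207, page_208], axis=0, join="outer"
).reset_index(drop=True)

def merge_split_rows(df, rows_to_fix, col_to_use=0, n_rows=2):
    # hypothetical stand-in for unfccc_ghg_data.helper.fix_rows: whenever
    # joining n_rows consecutive values of col_to_use yields one of the
    # target labels, collapse those rows into a single row
    out = []
    i = 0
    while i < len(df):
        block = df.iloc[i : i + n_rows]
        joined = " ".join(block.iloc[:, col_to_use]).strip()
        if joined.lower() in [label.lower() for label in rows_to_fix]:
            merged = block.iloc[-1].copy()
            merged.iloc[col_to_use] = joined
            out.append(merged)
            i += n_rows
        else:
            out.append(df.iloc[i])
            i += 1
    return pd.DataFrame(out).reset_index(drop=True)

df_year = merge_split_rows(
    df_year,
    rows_to_fix=["ch4 emission from rice field", "Total enteric ch4 emissions"],
    n_rows=2,
)
print(df_year)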