Daniel Busch 9 месяцев назад
Родитель
Commit
789e13eef2
1 изменённый файл: 38 добавлено и 430 удалено
  1. 38 430
      src/unfccc_ghg_data/unfccc_reader/Mongolia/read_MNG_BUR2_from_pdf.py

+ 38 - 430
src/unfccc_ghg_data/unfccc_reader/Mongolia/read_MNG_BUR2_from_pdf.py

@@ -16,6 +16,8 @@ from config_mng_bur2 import (  # noqa: E402
     coords_defaults,
     coords_terminologies,
     coords_value_mapping,
+    inv_conf_harvested_wood_products,
+    inv_conf_per_sector,
     meta_data,
 )
 
@@ -25,7 +27,7 @@ from unfccc_ghg_data.helper import (  # noqa: E402
     fix_rows,
 )
 
-if __name__ == "__main__":
+if __name__ == "__main__" :
     # ###
     # configuration
     # ###
@@ -33,7 +35,7 @@ if __name__ == "__main__":
     input_folder = downloaded_data_path / "UNFCCC" / "Mongolia" / "BUR2"
     output_folder = extracted_data_path / "UNFCCC" / "Mongolia"
 
-    if not output_folder.exists():
+    if not output_folder.exists() :
         output_folder.mkdir()
 
     pdf_file = "20231112_NIR_MGL.pdf"
@@ -41,9 +43,11 @@ if __name__ == "__main__":
     category_column = f"category ({coords_terminologies['category']})"
     compression = dict(zlib=True, complevel=9)
 
-    def repl(m):  # noqa: D103
+
+    def repl(m) :  # noqa: D103
         return m.group("code")
 
+
     # ###
     # 1. Read in main tables
     # ###
@@ -284,56 +288,9 @@ if __name__ == "__main__":
     # The table for harvested wood products is in a different format
     # and needs to be read in separately.
 
-    inv_conf_harvested_wood_products = {
-        'page' : '151',
-        "category_column" : 'Categories',
-        "cat_codes_manual" : {
-            'GHG emission' : '3.D.1',
-        },
-        'unit' : 'Gg',
-        'entity' : 'CO2',
-        'parts' : {
-            "part_1" : {
-                "page_defs" :
-                    {
-                        "area" : ["52,690,555,647"],
-                        "cols" : ["101,149,196,231,268,310,351,398,433,476,514"],
-                    },
-                "rows_to_fix" : {
-                    3 : [
-                        "GHG",
-                    ],
-                },
-            },
-            "part_2" : {
-                "page_defs" :
-                    {
-                        "area" : ["52,637,555,596"],
-                        "cols" : ["99,150,197,239,281,326,372,425,469,516"],
-                    },
-                "rows_to_fix" : {
-                    3 : [
-                        "GHG",
-                    ],
-                },
-            },
-            "part_3" : {
-                "page_defs" :
-                    {
-                        "area" : ["52,591,550,547"],
-                        "cols" : ["106,156,197,239,281,326,372,420,465,509"],
-                    },
-                "rows_to_fix" : {
-                    3 : [
-                        "GHG",
-                    ],
-                },
-            }},
-    }
-
     print("-" * 60)
     print(
-        f"Reading sector harvested wood products table."
+        "Reading sector harvested wood products table."
     )
 
     df_hwp = None
@@ -349,8 +306,9 @@ if __name__ == "__main__":
 
         df_hwp_part = tables_inventory_original[0].df
 
-        if "rows_to_fix" in inv_conf_harvested_wood_products['parts'][part]:
-            for n_rows in inv_conf_harvested_wood_products['parts'][part]["rows_to_fix"].keys():
+        if "rows_to_fix" in inv_conf_harvested_wood_products['parts'][part] :
+            for n_rows in inv_conf_harvested_wood_products[
+                'parts'][part]["rows_to_fix"].keys():
                 df_hwp_part = fix_rows(
                     df_hwp_part,
                     rows_to_fix=inv_conf_harvested_wood_products['parts'][part]["rows_to_fix"][n_rows],
@@ -379,7 +337,6 @@ if __name__ == "__main__":
         inv_conf_harvested_wood_products["cat_codes_manual"]
     )
 
-
     # unit is always the same
     df_hwp.loc[:, "unit"] = inv_conf_harvested_wood_products["unit"]
 
@@ -390,364 +347,9 @@ if __name__ == "__main__":
     # 3. Read in aggregated tables from 1990 - 2020
     # ###
 
-    inv_conf_per_sector = {
-        "total": {
-            "page_defs": {
-                "32": {
-                    "area": ["64,649,547,106"],
-                    "cols": ["106,182,237,294,345,403,480"],
-                },
-            },
-            "entity": "KYOTOGHG (SARGWP100)",
-            # "category_column": "Categories",
-            # "columns_to_drop": ["Categories"],
-            # "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
-            "unit": "Gg CO2e",
-            "last_year": "2020",
-            "rows_to_fix": {
-                -3: [
-                    "Year",
-                ],
-            },
-            "year_column": " Year ",
-            # TODO some categories are not recognized!
-            "cat_codes_manual": {
-                " Energy ": "1",
-                " IPPU ": "2",
-                " Agriculture ": "3",
-                " Waste ": "4",
-                " LULUCF ": "M.LULUCF",
-                "Total (excl. LULUCF)": "M.0.EL",
-                "Total (incl. LULUCF)": "M.0",
-            },
-        },
-        "energy": {
-            "page_defs": {
-                "43": {
-                    "area": ["59,478,544,79"],
-                    "cols": ["97,160,220,262,338,388,452,502"],
-                },
-                "44": {
-                    "area": ["60,773,546,582"],
-                    "cols": ["103,165,226,274,329,384,444,494"],
-                },
-            },
-            "entity": "KYOTOGHG (SARGWP100)",
-            # "category_column" : "Categories",
-            # "columns_to_drop" : ["Categories"],
-            # "years" : ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
-            "unit": "Gg CO2e",
-            "last_year": "2020",
-            "rows_to_fix": {
-                11: [
-                    "Years",
-                ],
-            },
-            "rows_to_drop": [0, 2],
-            "year_column": "Years     ",
-            "cat_codes_manual": {
-                r" 1.A.1.a.i Electricity  generation  ": "1.A.1.a.i",
-                r" 1.A.1.a.ii  Combined  heat and ipower peneration (CHP)": "1.A.1.a.ii",  # noqa: E501
-                r" 1.A.1.c.ii  Other  energy ndustries ": "1.A.1.c.ii",
-                r"Manufacturing industries and  construction   ": "1.A.2",
-                r" 1.A.3.a 1 Civil  aviation t  ": "1.A.3.a",
-                r" .A.3.b Road  ransportation  ": "1.A.3.b",
-                r" 1.A.3.c Railways    ": "1.A.3.c",
-                r" 1.A.3.e.ii  Off-road   ": "1.A.3.e.ii",
-            },
-        },
-        "energy cont": {
-            "page_defs": {
-                "44": {
-                    "area": ["59,552,553,84"],
-                    "cols": ["103,173,219,274,330,382,443,494"],
-                },
-            },
-            "entity": "KYOTOGHG (SARGWP100)",
-            # "category_column" : "Categories",
-            # "columns_to_drop" : ["Categories"],
-            # "years" : ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
-            "unit": "Gg CO2e",
-            "last_year": "2020",
-            "rows_to_fix": {
-                8: [
-                    "Years",
-                ],
-            },
-            "rows_to_drop": [0, 2],
-            "year_column": "Years    ",
-            "cat_codes_manual": {
-                "Other sectors 1.A.4.a Commercial/ Institutional  ": "1.A.4.a",
-                " 1.A.4.b Residen-tial  ": "1.A.4.b",
-                " 1.A.4.c.i Agriculture -Stationary  ": "1.A.4.c.i",
-                " 1.A.4.c.ii Agriculture -Off-road vehicles and other machinery": "1.A.4.c.ii",  # noqa: E501
-                "Non-specified 1.A.5.a Stationary  ": "1.A.5.a",
-                "Fugitive emis 1.B.1.a Coal mining & handling (surface mining) ": "1.B.1.a",  # noqa: E501
-                "sions from fu 1.B.2.a.ii Oil -Flaring  ": "1.B.2.a.ii",
-                "els 1.B.2.a.iii.2 Oil production and upgrading ": "1.B.2.a.iii",
-            },
-        },
-        "ippu": {
-            "page_defs": {
-                "74": {
-                    "area": ["68,701,544,313"],
-                    "cols": ["97,188,261,358,462"],
-                },
-            },
-            "entity": "KYOTOGHG (SARGWP100)",
-            # "category_column" : "Categories",
-            # "columns_to_drop" : ["Categories"],
-            # "years" : ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
-            "unit": "Gg CO2e",
-            "last_year": "2020",
-            "rows_to_fix": {
-                3: [
-                    "Year",
-                ],
-            },
-            "year_column": "Year ",
-            "cat_codes_manual": {
-                "2.A-Mineral industry ": "2.A",
-                "2.C-Metal industry ": "2.C",
-                "2.D-Non-energy products from fuels and solvent use": "2.D",
-                "2.F-Product uses as substitutes for ozone depleting substances": "2.F",
-                "2. IPPU Total ": "2",
-            },
-            "remove_duplicates": ["2"],
-        },
-        "livestock": {
-            "page_defs": {
-                "103": {
-                    "area": ["62,480,544,82"],
-                    "cols": ["97,182,259,326,403,474"],
-                },
-            },
-            # "entity": "KYOTOGHG (SARGWP100)",
-            # "category_column" : "Categories",
-            # "columns_to_drop" : ["Categories"],
-            # "years" : ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
-            "unit": "Gg CO2e",
-            "last_year": "2020",
-            "rows_to_fix": {
-                3: [
-                    "Year",
-                ],
-            },
-            "rows_to_drop": [0, 1],
-            "year_column": "Year ",
-            # TODO: This is far from than the actual categories but works for now
-            "cat_codes_manual": {
-                "Fermentation Gg": "3.A.1",
-                "Management CH4": "3.A.2",
-                " (Total CH4) ": "3.A",
-                "Fermentation Gg C": "3.A.1",
-                "Management O2e": "3.A.2",
-                " (Gg CO2e) ": "3.A",
-            },
-            "multi_entity": {
-                "unit": ["Gg", "Gg", "Gg", "Gg CO2e", "Gg CO2e", "Gg CO2e"],
-                "entity": [
-                    "CH4",
-                    "CH4",
-                    "CH4",
-                    "KYOTOGHG (SARGWP100)",
-                    "KYOTOGHG (SARGWP100)",
-                    "KYOTOGHG (SARGWP100)",
-                ],
-            },
-        },
-        "biomass_burning": {
-            "page_defs": {
-                "114": {
-                    "area": ["70,214,544,78"],
-                    "cols": ["116,185,239,304,365,426,491"],
-                },
-                "115": {
-                    "area": ["72,777,545,505"],
-                    "cols": ["123,190,250,313,374,438,495"],
-                },
-            },
-            "last_year": "2020",
-            "col_to_use": 5,
-            "rows_to_fix": {
-                7: [
-                    "3.C.1 - Emiss",
-                ],
-            },
-            "year_column": "  Year  ",
-            # TODO: These categories are technically duplicate, just with a different unit
-            "categories_to_drop": [
-                "3.C.1 -Emiss  CH4 (Gg CO2e)",
-                "ions from bioma (CO2e) N2O (Gg CO2e)",
-                "ss burning  Total (Gg CO2e)",
-            ],
-            # TODO: This is far from than the actual categories but works for now
-            "cat_codes_manual": {
-                " 3.C.1  CH4 (Gg) ": "3.C.1",
-                " -Emissions fr  N2O (Gg) ": "3.C.1",
-                " om biomass bur  NOx (Gg) ": "3.C.1",
-                " ning  CO(Gg) ": "3.C.1",
-            },
-            "multi_entity": {
-                "unit": ["Gg", "Gg", "Gg", "Gg"],
-                "entity": [
-                    "CH4",
-                    "N2O",
-                    "NOx",
-                    "CO",
-                ],
-            },
-        },
-        "managed_soils_direct": {
-            "page_defs": {
-                "119": {
-                    "area": ["70,600,541,173"],
-                    "cols": ["114,191,245,328,400,476"],
-                },
-            },
-            "last_year": "2020",
-            "col_to_use": 3,
-            "rows_to_fix": {
-                10: [
-                    "Urine and dung",
-                ],
-            },
-            "year_column": "  Year   ",
-            # # TODO: technically duplicate, just with a different unit
-            "categories_to_drop": [
-                " 3.C.4 -Direct N2O Emissions from managed soils (CO2e) Gg CO2e",
-            ],
-            # TODO: This is far from than the actual categories but works for now
-            "cat_codes_manual": {
-                # TODO the next 4 categories are made up placeholders
-                " Inorganic N fertilizer application  N2O (Gg)": "3.C.4.i",
-                " Organic N applied as fertilizer (manure) N2O (Gg)": "3.C.4.ii",
-                "Urine and dung N deposited on pasture, range and paddock by grazing animals N2O (Gg)": "3.C.4.iii",
-                "  N in crop residues  N2O (Gg)": "3.C.4.iiii",
-                " 3.C.4 -Direct N2O Emissions from managed soils N2O (Gg)": "3.C.4",
-            },
-            "multi_entity": {
-                "unit": ["Gg", "Gg", "Gg", "Gg", "Gg"],
-                "entity": [
-                    "N2O",
-                    "N2O",
-                    "N2O",
-                    "N2O",
-                    "N2O",
-                ],
-            },
-        },
-        "managed_soils_indirect": {
-            "page_defs": {
-                "125": {
-                    "area": ["74,214,539,83"],
-                    "cols": ["125,222,309,423"],
-                },
-                "126": {
-                    "area": ["72,775,539,369"],
-                    "cols": ["148,248,351,459"],
-                },
-            },
-            "last_year": "2020",
-            "col_to_use": 3,
-            "rows_to_fix": {
-                7: [
-                    "3.C.5 - Indirect N2O",
-                ],
-            },
-            "year_column": "  Year  ",
-            # # TODO: technically duplicate, just with a different unit
-            "categories_to_drop": [
-                "3.C.5 -Indirect N2O emissions from managed  soils Gg CO2e"
-            ],
-            # TODO: This is far from than the actual categories but works for now
-            "cat_codes_manual": {
-                # TODO the next 2 categories are made up placeholders
-                " Volatilization  pathway Gg N2O": "3.C.5.i",
-                " Leaching/runoff  pathway Gg N2O": "3.C.5.ii",
-                "3.C.5 -Indirect N2O emissions from managed  soils Gg N2O": "3.C.5",
-            },
-            "entity": "N2O",
-            "unit": "Gg",
-        },
-        "bio_waste" : {
-            "page_defs" : {
-                "157" : {
-                    "area" : ["68,748,541,228"],
-                    "cols" : ["108,176,222,283,332,387,429"],
-                },
-            },
-            "last_year" : "2020",
-            "rows_to_fix" : {
-                2 : [
-                    "Year",
-                ],
-            },
-            "year_column" : 'Year ',
-            # # TODO: technically duplicate, just with a different unit
-            "categories_to_drop" : [
-                'Total emissions from SWDS Gg CO2e'
-            ],
-            # TODO: This is far from than the actual categories but works for now
-            "cat_codes_manual" : {
-                # TODO the categories are made up placeholders
-                'Food ' : "4.A.1.food",
-                 'Garden ' : "4.A.1.garden",
-                 'Paper Gg CH4' : "4.A.1.paper",
-                 'Wood ' : "4.A.1.wood",
-                 'Textile ' : "4.A.1.textile",
-                 'Total ' : "4.A.1.",
-            },
-            "entity" : "CH4 ",
-            "unit" : "Gg",
-        },
-        "wastewater" : {
-            "page_defs" : {
-                "161" : {
-                    "area" : ["60,480,541,85"],
-                    "cols" : ["98,165,226,281,340,408,465"],
-                },
-                "162" : {
-                    "area" : ["62,775,541,613"],
-                    "cols" : ["110,176,229,288,349,414,486"],
-                },
-            },
-            "last_year" : "2020",
-            "col_to_use" : 7,
-            "rows_to_fix" : {
-                10 : [
-                    "Wastewater",
-                ],
-            },
-            "year_column" : '   Year  ',
-            # # TODO: technically duplicate, just with a different unit
-            "categories_to_drop" : [
-                ' Domestic wastewater  CH4 emissions ',
-                ' Domestic wastewater  N2O emissions (Gg C',
-                ' Industrial wastewater  CH4 emissions O2 e)',
-                'Wastewater treatment and discharge  Total emissions ',
-                ],
-            # TODO: This is far from than the actual categories but works for now
-            "cat_codes_manual" : {
-                # TODO the categories are made up placeholders
-                ' Domestic wastewater  CH4 emissions (Gg CH4)' : '4.D.1',
-                ' Domestic wastewater  N2O emissions (Gg N2O)' : '4.D.1',
-                ' Industrial wastewater  CH4 emissions (Gg CH4)' : '4.D.2',
-            },
-            "multi_entity" : {
-                "unit" : ["Gg", "Gg", "Gg"],
-                "entity" : [
-                    "CH4",
-                    "N2O",
-                    "CH4",
-                ],
-            },
-        }
-    }
-
     df_agg = None
     # TODO remove `reversed` (only for development)
-    for sector in list(reversed(list(inv_conf_per_sector.keys()))):
+    for sector in list(reversed(list(inv_conf_per_sector.keys()))) :
         print("-" * 60)
         print(
             f"Reading sector {sector} on page(s) {[*inv_conf_per_sector[sector]['page_defs']]}."
@@ -755,7 +357,7 @@ if __name__ == "__main__":
 
         df_sector = None
 
-        for page in [*inv_conf_per_sector[sector]["page_defs"]]:
+        for page in [*inv_conf_per_sector[sector]["page_defs"]] :
             tables_inventory_original = camelot.read_pdf(
                 str(input_folder / pdf_file),
                 pages=page,
@@ -767,9 +369,9 @@ if __name__ == "__main__":
 
             df_sector_page = tables_inventory_original[0].df
 
-            if df_sector is None:
+            if df_sector is None :
                 df_sector = df_sector_page
-            else:
+            else :
                 df_sector = pd.concat(
                     [df_sector, df_sector_page],
                     axis=0,
@@ -782,13 +384,13 @@ if __name__ == "__main__":
 
         df_sector = df_sector[0 : last_row + 1]
 
-        if "rows_to_fix" in inv_conf_per_sector[sector]:
-            for n_rows in inv_conf_per_sector[sector]["rows_to_fix"].keys():
+        if "rows_to_fix" in inv_conf_per_sector[sector] :
+            for n_rows in inv_conf_per_sector[sector]["rows_to_fix"].keys() :
                 print(f"Merge content for {n_rows=}")
                 # set the row
-                if "col_to_use" in inv_conf_per_sector[sector].keys():
+                if "col_to_use" in inv_conf_per_sector[sector].keys() :
                     col_to_use = inv_conf_per_sector[sector]["col_to_use"]
-                else:
+                else :
                     col_to_use = 0
                 df_sector = fix_rows(
                     df_sector,
@@ -799,8 +401,8 @@ if __name__ == "__main__":
 
         df_sector = df_sector.reset_index(drop=True)
 
-        if "rows_to_drop" in inv_conf_per_sector[sector]:
-            for row in inv_conf_per_sector[sector]["rows_to_drop"]:
+        if "rows_to_drop" in inv_conf_per_sector[sector] :
+            for row in inv_conf_per_sector[sector]["rows_to_drop"] :
                 df_sector = df_sector.drop(index=row)
 
         # TODO: Is it necessary to set the index here?
@@ -810,14 +412,14 @@ if __name__ == "__main__":
         df_sector = df_sector.T
 
         df_sector = df_sector.rename(
-            columns={inv_conf_per_sector[sector]["year_column"]: "category"}
+            columns={inv_conf_per_sector[sector]["year_column"] : "category"}
         )
 
         df_sector["category"] = df_sector["category"].str.replace("\n", "")
 
         # TODO Is this not the same as removing categories further down?
-        if "categories_to_drop" in inv_conf_per_sector[sector]:
-            for row in inv_conf_per_sector[sector]["categories_to_drop"]:
+        if "categories_to_drop" in inv_conf_per_sector[sector] :
+            for row in inv_conf_per_sector[sector]["categories_to_drop"] :
                 row_to_delete = df_sector.index[df_sector["category"] == row][0]
                 df_sector = df_sector.drop(index=row_to_delete)
 
@@ -825,12 +427,12 @@ if __name__ == "__main__":
             inv_conf_per_sector[sector]["cat_codes_manual"]
         )
 
-        if "multi_entity" in inv_conf_per_sector[sector]:
+        if "multi_entity" in inv_conf_per_sector[sector] :
             df_sector["entity"] = inv_conf_per_sector[sector]["multi_entity"]["entity"]
             df_sector["unit"] = inv_conf_per_sector[sector]["multi_entity"]["unit"]
             # df_sector = df_sector.set_index(["entity", "unit", "category"])
 
-        else:
+        else :
             # unit is always the same
             df_sector.loc[:, "unit"] = inv_conf_per_sector[sector]["unit"]
 
@@ -839,25 +441,31 @@ if __name__ == "__main__":
 
         # Some categories are in two tables (summary and sector)
         # Duplicates need to be removed
-        if "remove_duplicates" in inv_conf_per_sector[sector]:
-            for row in inv_conf_per_sector[sector]["remove_duplicates"]:
+        if "remove_duplicates" in inv_conf_per_sector[sector] :
+            for row in inv_conf_per_sector[sector]["remove_duplicates"] :
                 row_to_delete = df_sector.index[df_sector["category"] == row][0]
                 df_sector = df_sector.drop(index=row_to_delete)
 
-        if df_agg is None:
+        if df_agg is None :
             df_agg = df_sector
-        else:
+        else :
             df_agg = pd.concat(
                 [df_agg, df_sector],
                 axis=0,
                 join="outer",
             ).reset_index(drop=True)
 
-        for year in [str(y) for y in range(1990, 2021)]:
+        for year in [str(y) for y in range(1990, 2021)] :
             df_agg.loc[:, year] = df_agg[year].str.replace(",", "")
 
         # print(df_agg)
-    pass
+
+    # combine the harvested wood products table with all the other sectors
+    df_agg = pd.concat(
+        [df_agg, df_hwp],
+        axis=0,
+        join="outer",
+    ).reset_index(drop=True)
 
     ### convert to interchange format ###
     df_agg_IF = pm2.pm2io.convert_wide_dataframe_if(