Browse Source

Merge pull request #89 from JGuetschow/mongolia-BUR2-remaining-tables

crdanielbusch 5 months ago
parent
commit
8b01787b58

+ 1 - 0
extracted_data/UNFCCC/Mongolia/MNG_BUR2_2023_IPCC2006_PRIMAP.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/Xw/Gk/MD5E-s357474--92ddf7a6128cdc935782fb0d4b3e22aa.csv/MD5E-s357474--92ddf7a6128cdc935782fb0d4b3e22aa.csv

+ 1 - 0
extracted_data/UNFCCC/Mongolia/MNG_BUR2_2023_IPCC2006_PRIMAP.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/p2/1w/MD5E-s262809--265af905dd1358dc590c40468caa8b0a.nc/MD5E-s262809--265af905dd1358dc590c40468caa8b0a.nc

+ 25 - 0
extracted_data/UNFCCC/Mongolia/MNG_BUR2_2023_IPCC2006_PRIMAP.yaml

@@ -0,0 +1,25 @@
+attrs:
+  references: https://unfccc.int/documents/633382
+  rights: ''
+  contact: daniel-busch@climate-resource.de
+  title: Mongolia. Biennial update report (BUR). BUR2 Processed on 2024-07-11 Processed
+    on 2024-07-11
+  comment: Read fom pdf by Daniel Busch Processed on 2024-07-11 Processed on 2024-07-11
+  institution: UNFCCC
+  cat: category (IPCC2006_PRIMAP)
+  area: area (ISO3)
+  scen: scenario (PRIMAP)
+  entity: KYOTOGHG
+  gwp_context: AR6GWP100
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - source
+  - category (IPCC2006_PRIMAP)
+  - scenario (PRIMAP)
+  - area (ISO3)
+  - provenance
+  - entity
+  - unit
+data_file: MNG_BUR2_2023_IPCC2006_PRIMAP.csv

+ 1 - 0
extracted_data/UNFCCC/Mongolia/MNG_BUR2_2023_IPCC2006_PRIMAP_raw.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/6P/j1/MD5E-s149945--d429bb4261a70e28e57d8c50551ab8b2.csv/MD5E-s149945--d429bb4261a70e28e57d8c50551ab8b2.csv

+ 1 - 0
extracted_data/UNFCCC/Mongolia/MNG_BUR2_2023_IPCC2006_PRIMAP_raw.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/Vw/zK/MD5E-s132491--067ed21aa031e1a4c0fb58d26a9f667b.nc/MD5E-s132491--067ed21aa031e1a4c0fb58d26a9f667b.nc

+ 22 - 0
extracted_data/UNFCCC/Mongolia/MNG_BUR2_2023_IPCC2006_PRIMAP_raw.yaml

@@ -0,0 +1,22 @@
+attrs:
+  references: https://unfccc.int/documents/633382
+  rights: ''
+  contact: daniel-busch@climate-resource.de
+  title: Mongolia. Biennial update report (BUR). BUR2
+  comment: Read fom pdf by Daniel Busch
+  institution: UNFCCC
+  cat: category (IPCC2006_PRIMAP)
+  area: area (ISO3)
+  scen: scenario (PRIMAP)
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - source
+  - category (IPCC2006_PRIMAP)
+  - scenario (PRIMAP)
+  - area (ISO3)
+  - provenance
+  - entity
+  - unit
+data_file: MNG_BUR2_2023_IPCC2006_PRIMAP_raw.csv

+ 454 - 36
src/unfccc_ghg_data/unfccc_reader/Mongolia/config_mng_bur2.py

@@ -1,6 +1,7 @@
 """
-Configuration for Mongolia BUR2
+Configuration file to read Mongolia's BUR 2.
 """
+
 coords_terminologies = {
     "area": "ISO3",
     "category": "IPCC2006_PRIMAP",
@@ -15,10 +16,10 @@ inv_conf = {
     "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
     "cat_codes_manual": {
         # normalise category names that lack a space after the hyphen
-        " 2.G.2 -SF6 and PFCs from Other Product Uses": "2.G.2 - SF6 and PFCs from Other Product Uses",
-        " 2.G.3 -N2O from Product Uses": "2.G.3 - N2O from Product Uses",
-        " 1.C.1 -Transport of CO2": "1.C.1 - Transport of CO2",
-        " 3.C.1 -Emissions from biomass burning ": "3.C.1",
+        "2.G.2 -SF6 and PFCs from Other Product Uses": "2.G.2 - SF6 and PFCs from Other Product Uses",
+        "2.G.3 -N2O from Product Uses": "2.G.3 - N2O from Product Uses",
+        "1.C.1 -Transport of CO2": "1.C.1 - Transport of CO2",
+        "3.C.1 -Emissions from biomass burning ": "3.C.1",
         "Memo Items (5)": "MEMO",
         "International Bunkers": "M.BK",
         "1.A.3.a.i - International Aviation (International Bunkers) (1)": "M.BK.A",
@@ -77,7 +78,7 @@ inv_conf_per_year = {
                 "1.A.3.a.i - International Aviation (International",
             ],
             -2: ["3.C.1 - Emissions from biomass burning"],
-            2: [" 3.C.1 -Emissions from biomass burning"],
+            2: ["3.C.1 -Emissions from biomass burning"],
         },
         "page_defs": {
             "176": {
@@ -97,6 +98,7 @@ inv_conf_per_year = {
                 "cols": ["287,328,365,410,449,482,540,600,636,675,721,750"],
             },
         },
+        "skip_rows": 11,
     },
     "2020": {
         "page_defs": {
@@ -137,6 +139,7 @@ inv_conf_per_year = {
                 "2.B.4 - Caprolactam. Glyoxal and Glyoxylic Acid",
             ],
         },
+        "skip_rows": 0,
     },
 }
 
@@ -180,8 +183,9 @@ inv_conf_per_entity = {
         },
         "cat_codes_manual": {"Total National Emissions (Gg CO2e)": "0"},
         "category_column": "Categories",
-        "columns_to_drop": ["Share, %", "Categories"],
-        "years": ["2007", "2010", "2015", "2020"],
+        # 2007 will break gas basket consistency check
+        "columns_to_drop": ["Share, %", "Categories", "2007"],
+        "years": ["2010", "2015", "2020"],
         "unit": "Gg CO2e",
     },
     "N2O": {
@@ -230,6 +234,7 @@ inv_conf_per_entity = {
         "columns_to_drop": ["Share, %", "Categories"],
         "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
         "unit": "Gg",
+        "del_value": [("1995", "4"), ("2005", "4")],
     },
     "CO2": {
         "page_defs": {
@@ -257,7 +262,7 @@ inv_conf_per_entity = {
             ],
             5: ["2.D - Non-Energy"],
             -2: [
-                "Categories ",
+                "Categories",
                 "Emissions and Removals (Gg CO2)",
             ],
         },
@@ -265,14 +270,404 @@ inv_conf_per_entity = {
             "Total National Emissions (Gg CO2)",
             "Total National Removals (Gg CO2)",
         ],
-        "columns_to_drop": ["Share, %", " Categories "],
+        "columns_to_drop": ["Share, %", "Categories"],
         "cat_codes_manual": {"Total National Emissions and Removals (Gg CO2)": "0"},
-        "category_column": " Categories ",
+        "category_column": "Categories",
         "years": ["1990", "1995", "2000", "2005", "2010", "2015", "2020"],
         "unit": "Gg",
     },
 }
 
+inv_conf_harvested_wood_products = {
+    "page": "151",
+    "category_column": "Categories",
+    "cat_codes_manual": {
+        "GHG emission": "3.D.1",
+    },
+    "unit": "Gg",
+    "entity": "CO2",
+    # Table consists of three stacked parts
+    "parts": {
+        "part_1": {
+            "page_defs": {
+                "area": ["52,690,555,647"],
+                "cols": ["101,149,196,231,268,310,351,398,433,476,514"],
+            },
+            "rows_to_fix": {
+                3: [
+                    "GHG",
+                ],
+            },
+        },
+        "part_2": {
+            "page_defs": {
+                "area": ["52,637,555,596"],
+                "cols": ["99,150,197,239,281,326,372,425,469,516"],
+            },
+            "rows_to_fix": {
+                3: [
+                    "GHG",
+                ],
+            },
+        },
+        "part_3": {
+            "page_defs": {
+                "area": ["52,591,550,547"],
+                "cols": ["106,156,197,239,281,326,372,420,465,509"],
+            },
+            "rows_to_fix": {
+                3: [
+                    "GHG",
+                ],
+            },
+        },
+    },
+}
+
+inv_conf_per_sector = {
+    "total": {
+        "page_defs": {
+            "32": {
+                "area": ["64,649,547,106"],
+                "cols": ["106,182,237,294,345,403,480"],
+            },
+        },
+        "entity": "KYOTOGHG (SARGWP100)",
+        "unit": "Gg CO2e",
+        "last_year": "2020",
+        "rows_to_fix": {
+            -3: [
+                "Year",
+            ],
+        },
+        "year_column": "Year",
+        "cat_codes_manual": {
+            "Energy": "1",
+            "IPPU": "2",
+            "Agriculture": "M.AG",
+            "Waste": "4",
+            "LULUCF": "M.LULUCF",
+            "Total (excl. LULUCF)": "M.0.EL",
+            "Total (incl. LULUCF)": "0",
+        },
+    },
+    "energy": {
+        "page_defs": {
+            "43": {
+                "area": ["59,478,544,79"],
+                "cols": ["97,160,220,262,338,388,452,502"],
+            },
+            "44": {
+                "area": ["60,773,546,582"],
+                "cols": ["103,165,226,274,329,384,444,494"],
+            },
+        },
+        "entity": "KYOTOGHG (SARGWP100)",
+        "unit": "Gg CO2e",
+        "last_year": "2020",
+        "rows_to_fix": {
+            11: [
+                "Years",
+            ],
+        },
+        "rows_to_drop": [0, 2],
+        "year_column": "Years",
+        "cat_codes_manual": {
+            "1.A.1.a.i Electricity  generation": "1.A.1.a.i",
+            "1.A.1.a.ii  Combined  heat and ipower peneration (CHP)": "1.A.1.a.ii",
+            "1.A.1.c.ii  Other  energy ndustries": "1.A.1.c.ii",
+            "Manufacturing industries and  construction": "1.A.2",
+            "1.A.3.a 1 Civil  aviation t": "1.A.3.a",
+            ".A.3.b Road  ransportation": "1.A.3.b",
+            "1.A.3.c Railways": "1.A.3.c",
+            "1.A.3.e.ii  Off-road": "1.A.3.e.ii",
+        },
+    },
+    "energy cont": {
+        "page_defs": {
+            "44": {
+                "area": ["59,552,553,84"],
+                "cols": ["103,173,219,274,330,382,443,494"],
+            },
+        },
+        "entity": "KYOTOGHG (SARGWP100)",
+        "unit": "Gg CO2e",
+        "last_year": "2020",
+        "rows_to_fix": {
+            8: [
+                "Years",
+            ],
+        },
+        "rows_to_drop": [0, 2],
+        "year_column": "Years",
+        "cat_codes_manual": {
+            "Other sectors 1.A.4.a Commercial/ Institutional": "1.A.4.a",
+            "1.A.4.b Residen-tial": "1.A.4.b",
+            "1.A.4.c.i Agriculture -Stationary": "1.A.4.c.i",
+            "1.A.4.c.ii Agriculture -Off-road vehicles and other machinery": "1.A.4.c.ii",
+            "Non-specified 1.A.5.a Stationary": "1.A.5.a",
+            "Fugitive emis 1.B.1.a Coal mining & handling (surface mining)": "1.B.1.a",
+            "sions from fu 1.B.2.a.ii Oil -Flaring": "1.B.2.a.ii",
+            "els 1.B.2.a.iii.2 Oil production and upgrading": "1.B.2.a.iii",
+        },
+    },
+    "ippu": {
+        "page_defs": {
+            "74": {
+                "area": ["68,701,544,313"],
+                "cols": ["97,188,261,358,462"],
+            },
+        },
+        "entity": "KYOTOGHG (SARGWP100)",
+        "unit": "Gg CO2e",
+        "last_year": "2020",
+        "rows_to_fix": {
+            3: [
+                "Year",
+            ],
+        },
+        "year_column": "Year",
+        "cat_codes_manual": {
+            "2.A-Mineral industry": "2.A",
+            "2.C-Metal industry": "2.C",
+            "2.D-Non-energy products from fuels and solvent use": "2.D",
+            "2.F-Product uses as substitutes for ozone depleting substances": "2.F",
+            "2. IPPU Total": "2",
+        },
+        "remove_duplicates": ["2"],
+    },
+    "livestock": {
+        "page_defs": {
+            "103": {
+                "area": ["62,480,544,82"],
+                "cols": ["97,182,259,326,403,474"],
+            },
+        },
+        "unit": "Gg CO2e",
+        "last_year": "2020",
+        "rows_to_fix": {
+            3: [
+                "Year",
+            ],
+        },
+        "rows_to_drop": [0, 1],
+        "year_column": "Year",
+        "cat_codes_manual": {
+            "Fermentation Gg": "3.A.1",
+            "Management CH4": "3.A.2",
+            "(Total CH4)": "3.A",
+            "Fermentation Gg C": "3.A.1",
+            "Management O2e": "3.A.2",
+            "(Gg CO2e)": "3.A",
+        },
+        "multi_entity": {
+            "unit": ["Gg", "Gg", "Gg", "Gg CO2e", "Gg CO2e", "Gg CO2e"],
+            "entity": [
+                "CH4",
+                "CH4",
+                "CH4",
+                "KYOTOGHG (SARGWP100)",
+                "KYOTOGHG (SARGWP100)",
+                "KYOTOGHG (SARGWP100)",
+            ],
+        },
+    },
+    "biomass_burning": {
+        "page_defs": {
+            "114": {
+                "area": ["70,214,544,78"],
+                "cols": ["116,185,239,304,365,426,491"],
+            },
+            "115": {
+                "area": ["72,777,545,505"],
+                "cols": ["123,190,250,313,374,438,495"],
+            },
+        },
+        "last_year": "2020",
+        "col_to_use": 5,
+        "rows_to_fix": {
+            7: [
+                "3.C.1 - Emiss",
+            ],
+        },
+        "year_column": "Year",
+        # TODO: These categories are technically duplicate, just with a different unit
+        "categories_to_drop": [
+            "3.C.1 -Emiss  CH4 (Gg CO2e)",
+            "ions from bioma (CO2e) N2O (Gg CO2e)",
+            "ss burning  Total (Gg CO2e)",
+        ],
+        "cat_codes_manual": {
+            "3.C.1  CH4 (Gg)": "3.C.1",
+            "-Emissions fr  N2O (Gg)": "3.C.1",
+            "om biomass bur  NOx (Gg)": "3.C.1",
+            "ning  CO(Gg)": "3.C.1",
+        },
+        "multi_entity": {
+            "unit": ["Gg", "Gg", "Gg", "Gg"],
+            "entity": [
+                "CH4",
+                "N2O",
+                "NOx",
+                "CO",
+            ],
+        },
+    },
+    "managed_soils_direct": {
+        "page_defs": {
+            "119": {
+                "area": ["70,600,541,173"],
+                "cols": ["114,191,245,328,400,476"],
+            },
+        },
+        "last_year": "2020",
+        "col_to_use": 3,
+        "rows_to_fix": {
+            10: [
+                "Urine and dung",
+            ],
+        },
+        "year_column": "Year",
+        # TODO: these categories are technically duplicates, just with a different unit
+        "categories_to_drop": [
+            "3.C.4 -Direct N2O Emissions from managed soils (CO2e) Gg CO2e",
+            "Inorganic N fertilizer application  N2O (Gg)",
+            "Organic N applied as fertilizer (manure) N2O (Gg)",
+            "Urine and dung N deposited on pasture, range and paddock by grazing animals N2O (Gg)",
+            "N in crop residues  N2O (Gg)",
+        ],
+        "cat_codes_manual": {
+            # TODO the next 4 categories are made up placeholders
+            # "Inorganic N fertilizer application  N2O (Gg)": "3.C.4.i",
+            # "Organic N applied as fertilizer (manure) N2O (Gg)": "3.C.4.ii",
+            # "Urine and dung N deposited on pasture, range and paddock by grazing animals N2O (Gg)": "3.C.4.iii",
+            # "N in crop residues  N2O (Gg)": "3.C.4.iiii",
+            "3.C.4 -Direct N2O Emissions from managed soils N2O (Gg)": "3.C.4",
+        },
+        "entity": "N2O",
+        "unit": "Gg",
+        # "multi_entity": {
+        #     "unit": ["Gg", "Gg", "Gg", "Gg", "Gg"],
+        #     "entity": [
+        #         "N2O",
+        #         "N2O",
+        #         "N2O",
+        #         "N2O",
+        #         "N2O",
+        #     ],
+        # },
+    },
+    "managed_soils_indirect": {
+        "page_defs": {
+            "125": {
+                "area": ["74,214,539,83"],
+                "cols": ["125,222,309,423"],
+            },
+            "126": {
+                "area": ["72,775,539,369"],
+                "cols": ["148,248,351,459"],
+            },
+        },
+        "last_year": "2020",
+        "col_to_use": 3,
+        "rows_to_fix": {
+            7: [
+                "3.C.5 - Indirect N2O",
+            ],
+        },
+        "year_column": "Year",
+        # TODO: these categories are technically duplicates, just with a different unit
+        "categories_to_drop": [
+            "3.C.5 -Indirect N2O emissions from managed  soils Gg CO2e",
+            "Volatilization  pathway Gg N2O",
+            "Leaching/runoff  pathway Gg N2O",
+        ],
+        "cat_codes_manual": {
+            # TODO the next 2 categories are made up placeholders
+            # "Volatilization  pathway Gg N2O": "3.C.5.i",
+            # "Leaching/runoff  pathway Gg N2O": "3.C.5.ii",
+            "3.C.5 -Indirect N2O emissions from managed  soils Gg N2O": "3.C.5",
+        },
+        "entity": "N2O",
+        "unit": "Gg",
+    },
+    "bio_waste": {
+        "page_defs": {
+            "157": {
+                "area": ["68,748,541,228"],
+                "cols": ["108,176,222,283,332,387,429"],
+            },
+        },
+        "last_year": "2020",
+        "rows_to_fix": {
+            2: [
+                "Year",
+            ],
+        },
+        "year_column": "Year",
+        # TODO: these categories are technically duplicates, just with a different unit
+        "categories_to_drop": [
+            "Total emissions from SWDS Gg CO2e",
+            "Food",
+            "Garden",
+            "Paper Gg CH4",
+            "Wood",
+            "Textile",
+        ],
+        "cat_codes_manual": {
+            # TODO the categories are made up placeholders
+            # "Food": "4.A.1.food",
+            # "Garden": "4.A.1.garden",
+            # "Paper Gg CH4": "4.A.1.paper",
+            # "Wood": "4.A.1.wood",
+            # "Textile": "4.A.1.textile",
+            "Total": "4.A.1.",
+        },
+        "entity": "CH4 ",
+        "unit": "Gg",
+    },
+    "wastewater": {
+        "page_defs": {
+            "161": {
+                "area": ["60,480,541,85"],
+                "cols": ["98,165,226,281,340,408,465"],
+            },
+            "162": {
+                "area": ["62,775,541,613"],
+                "cols": ["110,176,229,288,349,414,486"],
+            },
+        },
+        "last_year": "2020",
+        "col_to_use": 7,
+        "rows_to_fix": {
+            10: [
+                "Wastewater",
+            ],
+        },
+        "year_column": "Year",
+        # TODO: these categories are technically duplicates, just with a different unit
+        "categories_to_drop": [
+            "Domestic wastewater  CH4 emissions",
+            "Domestic wastewater  N2O emissions (Gg C",
+            "Industrial wastewater  CH4 emissions O2 e)",
+            "Wastewater treatment and discharge  Total emissions",
+        ],
+        "cat_codes_manual": {
+            "Domestic wastewater  CH4 emissions (Gg CH4)": "4.D.1",
+            "Domestic wastewater  N2O emissions (Gg N2O)": "4.D.1",
+            "Industrial wastewater  CH4 emissions (Gg CH4)": "4.D.2",
+        },
+        "multi_entity": {
+            "unit": ["Gg", "Gg", "Gg"],
+            "entity": [
+                "CH4",
+                "N2O",
+                "CH4",
+            ],
+        },
+    },
+}
+
 # primap2 format conversion
 coords_cols = {
     "category": "category",
@@ -326,35 +721,30 @@ meta_data = {
 country_processing_step1 = {
     "tolerance": 0.01,
     "aggregate_cats": {
-        # TODO: Remove "M.3.C.AG". Just here to see previous aggregation setup.
-        # "M.3.C.AG": {
-        #     "sources": [
-        #         "3.C.1",
-        #         "3.C.2",
-        #         "3.C.3",
-        #         "3.C.4",
-        #         "3.C.5",
-        #         "3.C.6",
-        #         "3.C.7",
-        #         "3.C.8",
-        #     ],
-        #     "name": "Aggregate sources and non-CO2 emissions sources on land "
-        #     "(Agriculture)",
-        # },
-        "M.3.D.AG": {"sources": ["3.D.2"], "name": "Other (Agriculture)"},
-        # TODO: In this case 3.C should be equivalent to M.3.C.AG, but I'm not sure.
+        "M.3.D.AG": {"sources": ["3.D.2"]},
+        "M.3.C.AG": {
+            "sources": ["3.C.1", "3.C.4", "3.C.5"],
+        },
         "M.AG.ELV": {
-            "sources": ["3.C", "M.3.D.AG"],
-            "name": "Agriculture excluding livestock",
+            "sources": ["M.3.C.AG", "M.3.D.AG"],
         },
-        "M.AG": {"sources": ["3.A", "M.AG.ELV"], "name": "Agriculture"},
-        "M.3.D.LU": {"sources": ["3.D.1"], "name": "Other (LULUCF)"},
-        "M.LULUCF": {"sources": ["3.B", "M.3.D.LU"], "name": "LULUCF"},
+        # "3.A" : {"sources" : ["3.A.1", "3.A.2"]},
+        # "3.C" : {"sources" : ["3.C.1",
+        #                       "3.C.2",
+        #                       "3.C.3",
+        #                       "3.C.4",
+        #                       "3.C.5",
+        #                       "3.C.6",
+        #                       "3.C.7",
+        #                       "3.C.8", ]},
+        # "3.D" : {"sources" : ["3.D.1", "3.D.2"]},
+        "M.AG": {"sources": ["3.A", "M.AG.ELV"]},
+        "M.3.D.LU": {"sources": ["3.D.1"]},
+        "M.LULUCF": {"sources": ["3.B", "M.3.D.LU"]},
         "M.0.EL": {
-            "sources": ["1", "2", "M.AG", "4", "5"],
-            "name": "National total emissions excluding LULUCF",
+            "sources": ["1", "2", "M.AG", "4"],
         },
-        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},  # consistency check
+        "3": {"sources": ["M.AG", "M.LULUCF"]},  # consistency check
         "0": {"sources": ["1", "2", "3", "4"]},  # consistency check
     },
     "basket_copy": {
@@ -362,8 +752,36 @@ country_processing_step1 = {
         "entities": ["HFCS", "PFCS"],
         "source_GWP": gwp_to_use,
     },
+    "downscale": {
+        "sectors": {
+            "1.B_CH4": {
+                "basket": "1.B",
+                "basket_contents": ["1.B.1", "1.B.2"],
+                "entities": ["CH4"],
+                "dim": f"category ({coords_terminologies['category']})",
+                # "tolerance": 0.05,  # some inconsistencies (rounding?)
+            },
+            "1.B_CO2": {
+                "basket": "1.B",
+                "basket_contents": ["1.B.1", "1.B.2"],
+                "entities": ["CO2"],
+                "dim": f"category ({coords_terminologies['category']})",
+                "sel": {
+                    "time": [
+                        "2000",
+                        "2005",
+                        "2010",
+                        "2015",
+                        "2020",
+                    ]
+                },
+            },
+        }
+    },
 }
 
+country_processing_gas_baskets = {"tolerance": 0.02}
+
 gas_baskets = {
     "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
     "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],

+ 263 - 21
src/unfccc_ghg_data/unfccc_reader/Mongolia/read_MNG_BUR2_from_pdf.py

@@ -1,30 +1,36 @@
 """
 Read Mongolia's BUR2 from pdf
 """
+
 import camelot
 import pandas as pd
 import primap2 as pm2
-
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    fix_rows,
-    process_data_for_country,
-)
-from unfccc_ghg_data.unfccc_reader.Mongolia.config_mng_bur2 import (
+from config_mng_bur2 import (
     coords_cols,
     coords_defaults,
     coords_terminologies,
     coords_value_mapping,
+    country_processing_gas_baskets,
     country_processing_step1,
     filter_remove,
     gas_baskets,
     inv_conf,
+    inv_conf_harvested_wood_products,
     inv_conf_per_entity,
+    inv_conf_per_sector,
     inv_conf_per_year,
     meta_data,
 )
 
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    fix_rows,
+    process_data_for_country,
+)
+
+# pd.options.mode.chained_assignment = None  # default='warn'
+
 if __name__ == "__main__":
     # ###
     # configuration
@@ -92,7 +98,7 @@ if __name__ == "__main__":
 
         df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
 
-        skip_rows = 11
+        skip_rows = inv_conf_per_year[year]["skip_rows"]
         df_year = pd.concat(
             [df_header, df_year[skip_rows:]], axis=0, join="outer"
         ).reset_index(drop=True)
@@ -218,7 +224,8 @@ if __name__ == "__main__":
                 )
 
         df_entity.columns = df_entity.iloc[0, :]
-        df_entity = df_entity[1:]
+        # make a copy to avoid SettingWithCopyWarning
+        df_entity = df_entity[1:].copy()
 
         # one unit per entity table, taken from the config (Gg or Gg CO2e)
         df_entity.loc[:, "unit"] = inv_conf_per_entity[entity]["unit"]
@@ -251,6 +258,10 @@ if __name__ == "__main__":
         for year in inv_conf_per_entity[entity]["years"]:
             df_entity.loc[:, year] = df_entity[year].str.replace(",", "")
 
+        # if "del_value" in inv_conf_per_entity[entity]:
+        #     for year_del, category_del in inv_conf_per_entity[entity]["del_value"]:
+        #         df_entity.loc[df_entity["category"] == category_del, year_del] = ""
+
         if df_trend is None:
             df_trend = df_entity
         else:
@@ -278,16 +289,235 @@ if __name__ == "__main__":
     data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_IF)
 
     # ###
-    # Merge main and trend tables.
+    # 3. Read harvested wood products table
     # ###
 
-    print("Merging main and trend table.")
-    data_pm2 = data_main_pm2.pr.merge(data_trend_pm2, tolerance=1)
+    # The table for harvested wood products is in a different format
+    # and needs to be read in separately.
+
+    print("-" * 60)
+    print("Reading sector harvested wood products table.")
+    print("-" * 60)
+
+    df_hwp = None
+    for part in [*inv_conf_harvested_wood_products["parts"]]:
+        tables_inventory_original = camelot.read_pdf(
+            str(input_folder / pdf_file),
+            pages=inv_conf_harvested_wood_products["page"],
+            table_areas=inv_conf_harvested_wood_products["parts"][part]["page_defs"][
+                "area"
+            ],
+            columns=inv_conf_harvested_wood_products["parts"][part]["page_defs"][
+                "cols"
+            ],
+            flavor="stream",
+            split_text=True,
+        )
+
+        df_hwp_part = tables_inventory_original[0].df
+
+        if "rows_to_fix" in inv_conf_harvested_wood_products["parts"][part]:
+            for n_rows in inv_conf_harvested_wood_products["parts"][part][
+                "rows_to_fix"
+            ].keys():
+                df_hwp_part = fix_rows(
+                    df_hwp_part,
+                    rows_to_fix=inv_conf_harvested_wood_products["parts"][part][
+                        "rows_to_fix"
+                    ][n_rows],
+                    col_to_use=0,
+                    n_rows=n_rows,
+                )
+
+        df_hwp_part = df_hwp_part.drop(1, axis=0).reset_index(drop=True)
+
+        if df_hwp is None:
+            df_hwp = df_hwp_part
+        else:
+            # stack horizontally
+            df_hwp = pd.concat(
+                [df_hwp, df_hwp_part.drop(0, axis=1)],
+                axis=1,
+                join="outer",
+            ).reset_index(drop=True)
+
+    # assign the years to the columns
+    df_hwp = pd.DataFrame(df_hwp.to_numpy()[1:], columns=df_hwp.iloc[0])
+
+    df_hwp = df_hwp.rename(
+        columns={inv_conf_harvested_wood_products["category_column"]: "category"}
+    )
+
+    df_hwp.loc[:, "category"] = df_hwp.loc[:, "category"].replace(
+        inv_conf_harvested_wood_products["cat_codes_manual"]
+    )
+
+    # unit is always the same
+    df_hwp.loc[:, "unit"] = inv_conf_harvested_wood_products["unit"]
+
+    # and only one entity per table
+    df_hwp.loc[:, "entity"] = inv_conf_harvested_wood_products["entity"]
 
     # ###
-    # Save raw data to IF and native format.
+    # 4. Read in aggregated tables from 1990 - 2020
     # ###
 
+    df_agg = None
+
+    for sector in list(inv_conf_per_sector.keys()):
+        print("-" * 60)
+        print(
+            f"Reading sector {sector} on page(s) \
+            {[*inv_conf_per_sector[sector]['page_defs']]}."
+        )
+
+        df_sector = None
+
+        for page in [*inv_conf_per_sector[sector]["page_defs"]]:
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                table_areas=inv_conf_per_sector[sector]["page_defs"][page]["area"],
+                columns=inv_conf_per_sector[sector]["page_defs"][page]["cols"],
+                flavor="stream",
+                split_text=True,
+            )
+
+            df_sector_page = tables_inventory_original[0].df
+
+            if df_sector is None:
+                df_sector = df_sector_page
+            else:
+                df_sector = pd.concat(
+                    [df_sector, df_sector_page],
+                    axis=0,
+                    join="outer",
+                ).reset_index(drop=True)
+
+            print(f"adding table from page {page}.")
+
+        last_row = df_sector.loc[df_sector[0] == "2020"].index[0]
+
+        df_sector = df_sector[0 : last_row + 1]
+
+        if "rows_to_fix" in inv_conf_per_sector[sector]:
+            for n_rows in inv_conf_per_sector[sector]["rows_to_fix"].keys():
+                print(f"Merge content for {n_rows=}")
+                # set the row
+                if "col_to_use" in inv_conf_per_sector[sector].keys():
+                    col_to_use = inv_conf_per_sector[sector]["col_to_use"]
+                else:
+                    col_to_use = 0
+                df_sector = fix_rows(
+                    df_sector,
+                    rows_to_fix=inv_conf_per_sector[sector]["rows_to_fix"][n_rows],
+                    col_to_use=col_to_use,
+                    n_rows=n_rows,
+                )
+
+        df_sector = df_sector.reset_index(drop=True)
+
+        if "rows_to_drop" in inv_conf_per_sector[sector]:
+            for row in inv_conf_per_sector[sector]["rows_to_drop"]:
+                df_sector = df_sector.drop(index=row)
+
+        # TODO: Is it necessary to set the index here?
+        df_sector = df_sector.set_index(0)
+
+        # transpose so categories are in first columns
+        df_sector = df_sector.T
+
+        # strip white spaces from column names
+        df_sector.columns = df_sector.columns.str.strip()
+
+        df_sector = df_sector.rename(
+            columns={inv_conf_per_sector[sector]["year_column"]: "category"}
+        )
+
+        df_sector["category"] = df_sector["category"].str.strip()
+        df_sector["category"] = df_sector["category"].str.replace("\n", "")
+
+        # TODO This is the same functionality as remove_duplicates ?
+        if "categories_to_drop" in inv_conf_per_sector[sector]:
+            for row in inv_conf_per_sector[sector]["categories_to_drop"]:
+                row_to_delete = df_sector.index[df_sector["category"] == row][0]
+                df_sector = df_sector.drop(index=row_to_delete)
+
+        df_sector.loc[:, "category"] = df_sector.loc[:, "category"].replace(
+            inv_conf_per_sector[sector]["cat_codes_manual"]
+        )
+
+        if "multi_entity" in inv_conf_per_sector[sector]:
+            df_sector["entity"] = inv_conf_per_sector[sector]["multi_entity"]["entity"]
+            df_sector["unit"] = inv_conf_per_sector[sector]["multi_entity"]["unit"]
+
+        else:
+            # unit is always the same
+            df_sector.loc[:, "unit"] = inv_conf_per_sector[sector]["unit"]
+
+            # and only one entity per table
+            df_sector.loc[:, "entity"] = inv_conf_per_sector[sector]["entity"]
+
+        # Some categories are in two tables (summary and sector)
+        # Duplicates need to be removed
+        if "remove_duplicates" in inv_conf_per_sector[sector]:
+            for row in inv_conf_per_sector[sector]["remove_duplicates"]:
+                row_to_delete = df_sector.index[df_sector["category"] == row][0]
+                df_sector = df_sector.drop(index=row_to_delete)
+
+        if df_agg is None:
+            df_agg = df_sector
+        else:
+            df_agg = pd.concat(
+                [df_agg, df_sector],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+        for year in [str(y) for y in range(1990, 2021)]:
+            df_agg.loc[:, year] = df_agg[year].str.replace(",", "")
+
+    # add harvested wood products table and all the other sectors together
+    df_agg = pd.concat(
+        [df_agg, df_hwp],
+        axis=0,
+        join="outer",
+    ).reset_index(drop=True)
+
+    # There are more tables in the document that could be read, but are less relevant
+    # on pages 67, 78, 91, 105/6, 110/111
+
+    ### convert to interchange format ###
+    df_agg_IF = pm2.pm2io.convert_wide_dataframe_if(
+        data_wide=df_agg,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        # filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
+
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_agg_pm2 = pm2.pm2io.from_interchange_format(df_agg_IF)
+
+    # # ###
+    # # Merge tables.
+    # # ###
+
+    print("Merging main and trend table.")
+    data_pm2 = data_main_pm2.pr.merge(data_trend_pm2, tolerance=1)
+
+    print("Merging sector tables.")
+    data_pm2 = data_pm2.pr.merge(data_agg_pm2, tolerance=1)
+
+    # # ###
+    # # Save raw data to IF and native format.
+    # # ###
+
     data_if = data_pm2.pr.to_interchange_format()
 
     pm2.pm2io.write_interchange_format(
@@ -302,11 +532,12 @@ if __name__ == "__main__":
         encoding=encoding,
     )
 
-    # ###
-    # Processing
-    # ###
+    # # ###
+    # # Processing
+    # # ###
 
-    data_proc_pm2 = process_data_for_country(
+    # create the gas baskets before aggregating the categories
+    data_proc_pm2_gas_baskets = process_data_for_country(
         data_country=data_pm2,
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
@@ -314,12 +545,23 @@ if __name__ == "__main__":
         cat_terminology_out=None,
         category_conversion=None,
         sectors_out=None,
+        processing_info_country=country_processing_gas_baskets,
+    )
+
+    data_proc_pm2 = process_data_for_country(
+        data_country=data_proc_pm2_gas_baskets,
+        entities_to_ignore=[],
+        gas_baskets=None,
+        filter_dims=None,
+        cat_terminology_out=None,
+        category_conversion=None,
+        sectors_out=None,
         processing_info_country=country_processing_step1,
     )
 
-    # ###
-    # save processed data to IF and native format
-    # ###
+    # # ###
+    # # save processed data to IF and native format
+    # # ###
 
     terminology_proc = coords_terminologies["category"]