32 Commits ee7ab03794 ... c6e0a4aa05

Author SHA1 Message Date
  Johannes Gütschow c6e0a4aa05 Merge remote-tracking branch 'github/main' into CCPI_nAI_2024 4 months ago
  Johannes Gütschow 692dc70a45 Merge pull request #108 from JGuetschow/dependabot/pip/cryptography-43.0.1 4 months ago
  crdanielbusch 8803e8a39c Merge pull request #99 from JGuetschow/cabo-verde 4 months ago
  dependabot[bot] 09dd29834c Bump cryptography from 42.0.8 to 43.0.1 4 months ago
  Johannes Gütschow b6ab1e67aa Merge pull request #107 from JGuetschow/dependabot/pip/jupyterlab-4.2.5 4 months ago
  Johannes Gütschow d6bcee5634 Merge pull request #106 from JGuetschow/dependabot/pip/notebook-7.2.2 4 months ago
  dependabot[bot] 42897edc03 Bump jupyterlab from 4.2.2 to 4.2.5 4 months ago
  dependabot[bot] 4d959bf252 Bump notebook from 7.2.1 to 7.2.2 4 months ago
  Johannes Gütschow 9dcd853311 Merge remote-tracking branch 'refs/remotes/origin/main' into cabo-verde 5 months ago
  Johannes Gütschow c4fcc62b1b [DATALAD RUNCMD] Read data for CPV, BUR1. 5 months ago
  Johannes Gütschow 957208c7dd Fixed to CPV BUR1 processing 5 months ago
  Johannes Gütschow 778bd85377 [DATALAD RUNCMD] Read data for CPV, BUR1. 5 months ago
  Johannes Gütschow 700a937702 [DATALAD RUNCMD] Update folder mapping for src/unfccc_ghg_data/unfccc_reader 5 months ago
  Daniel Busch 30c292dbbb clean up 5 months ago
  Daniel Busch b29802e91f downscaling 5 months ago
  Daniel Busch 9a3330206c F-gases conversion 5 months ago
  Daniel Busch 79a820214b downscaling 2 5 months ago
  Daniel Busch d5d3f88038 downscaling 1 and 2 5 months ago
  Daniel Busch 67c3ab3a35 cat aggregation 5 months ago
  Daniel Busch 3da2fdb38c merge from main 5 months ago
  Daniel Busch 4b6161574d Merge remote-tracking branch 'refs/remotes/github/main' into cabo-verde 5 months ago
  crdanielbusch f85b63d264 Update src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/__init__.py 5 months ago
  Johannes Gütschow 7c02e41474 Merge remote-tracking branch 'refs/remotes/origin/main' into cabo-verde 5 months ago
  Daniel Busch 8d308f87d5 processing: additional cats 5 months ago
  Daniel Busch 5da3fcc34e processing: category aggregation consistency checks and gas baskets 5 months ago
  Daniel Busch 1a6a6a0c69 all tables merged in pm2 format, saved raw 5 months ago
  Daniel Busch 2c5616c8fa main table 5 months ago
  Daniel Busch 8334870494 main table from page 86 - unprocessed 5 months ago
  Daniel Busch fa847ef62b tables on page 33, 39 5 months ago
  Daniel Busch e543c37cf5 iso3 country code 5 months ago
  Daniel Busch 428611fc60 updates for docs and ci from Bangladesh branch 5 months ago
  Daniel Busch fa2c8104f9 test push 5 months ago

+ 1 - 0
extracted_data/UNFCCC/Cabo_Verde/CPV_BUR1_2023_IPCC2006_PRIMAP.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/wk/g9/MD5E-s91654--1b95a9d247742af049b9f8a083aa26ac.csv/MD5E-s91654--1b95a9d247742af049b9f8a083aa26ac.csv

+ 1 - 0
extracted_data/UNFCCC/Cabo_Verde/CPV_BUR1_2023_IPCC2006_PRIMAP.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/WG/MG/MD5E-s116273--5d8b2fdf2a5a7dcc52a25307d83c1a2e.nc/MD5E-s116273--5d8b2fdf2a5a7dcc52a25307d83c1a2e.nc
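
The csv and nc entries above are git-annex pointer files: this repository is managed with DataLad, so only the MD5E keys are committed while the file payloads live in the annex object store. Assuming a standard DataLad checkout, the actual contents can be fetched with, for example:

    datalad get extracted_data/UNFCCC/Cabo_Verde/CPV_BUR1_2023_IPCC2006_PRIMAP.nc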

+ 24 - 0
extracted_data/UNFCCC/Cabo_Verde/CPV_BUR1_2023_IPCC2006_PRIMAP.yaml

@@ -0,0 +1,24 @@
+attrs:
+  references: unfccc.int/documents/638907
+  rights: ''
+  contact: daniel-busch@climate-resource.de
+  title: Cabo Verde. Biennial update report (BUR). BUR1 Processed
+    on 2024-08-12
+  comment: Read from pdf by Daniel Busch Processed on 2024-08-12
+  institution: UNFCCC
+  cat: category (IPCC2006_PRIMAP)
+  area: area (ISO3)
+  scen: scenario (PRIMAP)
+  gwp_context: SARGWP100
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - source
+  - area (ISO3)
+  - provenance
+  - category (IPCC2006_PRIMAP)
+  - scenario (PRIMAP)
+  - entity
+  - unit
+data_file: CPV_BUR1_2023_IPCC2006_PRIMAP.csv
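
The yaml file above is the metadata half of the primap2 interchange format; its data_file key points at the csv holding the values. A minimal sketch of loading the dataset (assuming the annexed files have been fetched; read_interchange_format and open_dataset are primap2's standard readers, and passing the path without suffix is an assumption about their path handling):

    import primap2 as pm2

    # interchange format: yaml metadata + csv data
    data_if = pm2.pm2io.read_interchange_format(
        "extracted_data/UNFCCC/Cabo_Verde/CPV_BUR1_2023_IPCC2006_PRIMAP"
    )
    data = pm2.pm2io.from_interchange_format(data_if)

    # or directly from the netcdf file
    data = pm2.open_dataset(
        "extracted_data/UNFCCC/Cabo_Verde/CPV_BUR1_2023_IPCC2006_PRIMAP.nc"
    )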

+ 1 - 0
extracted_data/UNFCCC/Cabo_Verde/CPV_BUR1_2023_IPCC2006_PRIMAP_raw.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/W9/5z/MD5E-s20131--304eefaee6e626f07975e16cd490aa8a.csv/MD5E-s20131--304eefaee6e626f07975e16cd490aa8a.csv

+ 1 - 0
extracted_data/UNFCCC/Cabo_Verde/CPV_BUR1_2023_IPCC2006_PRIMAP_raw.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/mm/Xm/MD5E-s55465--1344f4ebe111241c209c354e320de338.nc/MD5E-s55465--1344f4ebe111241c209c354e320de338.nc

+ 22 - 0
extracted_data/UNFCCC/Cabo_Verde/CPV_BUR1_2023_IPCC2006_PRIMAP_raw.yaml

@@ -0,0 +1,22 @@
+attrs:
+  references: unfccc.int/documents/638907
+  rights: ''
+  contact: daniel-busch@climate-resource.de
+  title: Cabo Verde. Biennial update report (BUR). BUR1
+  comment: Read from pdf by Daniel Busch
+  institution: UNFCCC
+  cat: category (IPCC2006_PRIMAP)
+  area: area (ISO3)
+  scen: scenario (PRIMAP)
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - source
+  - area (ISO3)
+  - provenance
+  - category (IPCC2006_PRIMAP)
+  - scenario (PRIMAP)
+  - entity
+  - unit
+data_file: CPV_BUR1_2023_IPCC2006_PRIMAP_raw.csv

File diff suppressed because it is too large
+ 354 - 413
poetry.lock


+ 1 - 1
pyproject.toml

@@ -50,7 +50,7 @@ ruff = "^0.1.8"
 pre-commit = "^3.3.1"
 towncrier = "^23.6.0"
 liccheck = "^0.9.1"
-notebook = "^7.2.0"
+notebook = "^7.2.2"
 ipywidgets = "^8.1.2"
 ipympl = "^0.9.4"
 

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/__init__.py

@@ -0,0 +1,30 @@
+"""Read Cabo Verde's BURs, NIRs, NCs
+
+Scripts and configurations to read Cabo Verde's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'CPV'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    # print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=CPV
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 374 - 0
src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/config_cpv_bur1.py

@@ -0,0 +1,374 @@
+"""
+Configuration for Cabo Verde BUR1 (read from pdf)
+"""
+
+# reading tables on pages:
+# 33/1, GHG emissions and removals by type of gas, by sector and by year
+# 39, Total GHG Emissions, in CO2eq, for international bunkers, in 1995, 2000, 2005, 2010, 2015 and 2019
+# 86-89, GHG emissions in 2019
+# Not reading tables on pages:
+# 37/38, has additional columns on PFCs, Unspecified mixture of HFCs and PFCs,
+# and SF6, but they are all empty
+# 32, same information as in table 33/1
+# 33/2, aggregation of table 33/1
+# 43, no new information here
+
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+# primap2 format conversion
+coords_cols = {
+    "category": "category",
+    "entity": "entity",
+    "unit": "unit",
+}
+
+coords_defaults = {
+    "source": "CPV-GHG-Inventory",
+    "provenance": "measured",
+    "area": "CPV",
+    "scenario": "BUR1",
+}
+
+gwp_to_use = "SARGWP100"
+
+coords_value_mapping_main = {
+    "unit": "PRIMAP1",
+    "category": "PRIMAP1",
+    "entity": {
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "HFC": f"HFCS ({gwp_to_use})",
+    },
+}
+
+coords_value_mapping = {
+    "unit": "PRIMAP1",
+    "category": "PRIMAP1",
+    "entity": {
+        "CO²": "CO2",
+        "CH⁴": "CH4",
+        "N²O": "N2O",
+        "F-gases": f"FGASES ({gwp_to_use})",
+    },
+}
+
+filter_remove = {
+    "f_memo": {"category": "MEMO"},
+    # They are all NaN and don't match a pre-defined entity
+    "f_fluor": {"entity": "Other fluorinated products"},
+}
+
+meta_data = {
+    "references": "unfccc.int/documents/638907",
+    "rights": "",  # unknown
+    "contact": "daniel-busch@climate-resource.de",
+    "title": "Cabo Verde. Biennial update report (BUR). BUR1",
+    "comment": "Read fom pdf by Daniel Busch",
+    "institution": "UNFCCC",
+}
+
+trend_years = ["1995", "2000", "2005", "2010", "2015", "2019"]
+
+inv_conf_per_sector = {
+    "main": {
+        "page": "33",
+        "skip_rows_start": 2,
+        "cat_codes_manual": {
+            "Energy": "1",
+            "IPPU": "2",
+            "Agriculture": "M.AG",
+            "LULUCF": "M.LULUCF",
+            "Waste": "4",
+        },
+        "header": ["category", "entity", *trend_years],
+        "unit": ["Gg"] * 4 + ["GgCO2eq"] + ["Gg"] * 9,
+        "unit_conversion": {
+            "index": 6,
+            "conversion_factor": 2240.625,
+        },
+        # "unit": ["Gg"] * 4 + ["Gg CO2eq"] + ["Gg"] * 9,
+    },
+    "int_bunkers": {
+        "page": "39",
+        "skip_rows_start": 2,
+        "cat_codes_manual": {
+            "Total International Bunkers": "M.BK",
+            "International aviation": "M.BK.A",
+            "International shipping": "M.BK.M",
+        },
+        "header": ["category", *trend_years],
+        "unit": "Gg CO2eq",
+        "drop_cols": 7,
+        "entity": "KYOTOGHG (SARGWP100)",
+    },
+}
+
+inv_conf = {
+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
+    "year": "2019",
+    # TODO check again!
+    # "CO2 emissions from Biomass" and "CO2 emissions from using manure as energy" are the same category
+    "merge_cats": "MBIO",
+}
+
+inv_conf_main = {
+    "pages": {
+        "86": {
+            "skip_rows_start": 2,
+            "entities": ["CO2", "CH4", "N2O"],
+            "column_names": ["category", "CO2", "CH4", "N2O"],
+            "cat_codes_manual": {
+                "Memo items": "MEMO",
+                "International bunkers": "M.BK",
+                "CO² emissions from Biomass": "M.BIO",
+                "CO² emissions from using manure as energy": "M.BIO",
+            },
+            "unit_for_entity": {
+                "CO2": "Gg",
+                "CH4": "Gg",
+                "N2O": "Gg",
+            },
+            # "units" : ["no unit", "Gg", "Gg", "Gg"]
+        },
+        "87": {
+            "skip_rows_start": 2,
+            "entities": ["CO2", "CH4", "N2O", "HFCs", "Other fluorinated products"],
+            "column_names": [
+                "category",
+                "CO2",
+                "CH4",
+                "N2O",
+                "HFCs",
+                "Other fluorinated products",
+            ],
+            "cat_codes_manual": {
+                "2F. Use of products as substitutes for \nsubstances that degrade the ozone layer": "2.F",
+                "2B4. Production of caprolactam, \nglyoxal and glyoxylic acid": "2.B.4",
+            },
+            "unit_for_entity": {
+                "CO2": "Gg",
+                "CH4": "Gg",
+                "N2O": "Gg",
+                "HFCs": "Gg CO2eq",
+            },
+        },
+        "88": {
+            "skip_rows_start": 2,
+            "entities": ["CO2", "CH4", "N2O"],
+            "column_names": ["category", "CO2", "CH4", "N2O"],
+            "cat_codes_manual": {
+                "3C6. Indirect emissions of N²O from manure \nmanagement": "3.C.6",
+                "3C. Aggregate Sources and Sources of Non-CO²\nEmissions in the soil": "3.C",
+            },
+            "unit_for_entity": {
+                "CO2": "Gg",
+                "CH4": "Gg",
+                "N2O": "Gg",
+            },
+        },
+        "89": {
+            "skip_rows_start": 2,
+            "entities": ["CO2", "CH4", "N2O"],
+            "column_names": ["category", "CO2", "CH4", "N2O"],
+            "cat_codes_manual": {
+                "3C6. Indirect emissions of N²O from manure \nmanagement": "3.C.6",
+                "3C. Aggregate Sources and Sources of Non-CO²\nEmissions in the soil": "3.C",
+            },
+            "unit_for_entity": {
+                "CO2": "Gg",
+                "CH4": "Gg",
+                "N2O": "Gg",
+            },
+        },
+    },
+}
+
+country_processing_step1 = {
+    # relative rounding error of 0.038 for yr2019/entN2O/cat4: 0.011 + 0.015 != 0.027
+    "tolerance": 0.04,
+    "aggregate_cats": {
+        # First generate additional categories
+        "0": {"sources": ["1", "2", "3", "4", "5"]},
+        "2.A": {"sources": ["2.A.1", "2.A.2", "2.A.3", "2.A.4", "2.A.5"]},
+        "2.C": {
+            "sources": [
+                "2.C.1",
+                "2.C.2",
+                "2.C.3",
+                "2.C.4",
+                "2.C.5",
+                "2.C.6",
+                "2.C.7",
+            ]
+        },
+        "2.E": {"sources": ["2.E.1", "2.E.2", "2.E.3", "2.E.4", "2.E.5"]},
+        "3.C": {
+            "sources": [
+                "3.C.1",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+                "3.C.8",
+            ]
+        },
+        "3.D": {"sources": ["3.D.2"]},
+        # 3.D would go into M.LULUCF as well but we don't have it
+        "M.LULUCF": {"sources": ["3.B"]},
+        # Only 3.C.7 in table, but values are all zero or empty
+        "M.3.C.AG": {
+            "sources": [
+                "3.C.1",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+                "3.C.8",
+            ]
+        },
+        # 3.D.2 is all zeros
+        "M.3.D.AG": {"sources": ["3.D.2"]},
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG", "M.3.D.AG"],
+        },
+        "M.AG": {"sources": ["3.A", "M.AG.ELV"]},
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4"],
+        },
+        "4.D": {"sources": ["4.D.1", "4.D.2"]},  # consistency check
+        "1": {"sources": ["1.A"]},  # consistency check
+        "1.A": {
+            "sources": ["1.A.1", "1.A.2", "1.A.3", "1.A.4", "1.A.5"]
+        },  # consistency check
+        "2": {
+            "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"]
+        },  # consistency check
+        "3": {"sources": ["M.AG", "M.LULUCF"]},  # consistency check
+        # "3.A": {"sources": ["3.A.1", "3.A.2"]}, # consistency check
+        "4": {"sources": ["4.A", "4.B", "4.C", "4.D", "4.E"]},  # consistency check
+    },
+}
+
+country_processing_step2 = {
+    "downscale": {
+        "sectors": {
+            "1_all": {
+                "basket": "1",
+                "basket_contents": ["1.A"],
+                "entities": ["CO2", "CH4", "N2O"],
+                "dim": f'category ({coords_terminologies["category"]})',
+            },
+            # Values for 1995/2000/2005/2010/2015/2019 are only available for CO2 and F-gases (table 6)
+            "2_CO2": {
+                "basket": "2",
+                "basket_contents": ["2.A", "2.B", "2.C", "2.D", "2.H"],
+                "entities": ["CO2"],
+                "dim": f'category ({coords_terminologies["category"]})',
+            },
+            # "2_KYOTO": {  # commented as KYOTOGHG only present for bunkers, so aggregate later
+            #     # error of 3.5% for KYOTOGHG (AR6GWP100)
+            #     "check_consistency": False,
+            #     "basket": "2",
+            #     "basket_contents": [
+            #         "2.A",
+            #         "2.B",
+            #         "2.C",
+            #         "2.D",
+            #         "2.E",
+            #         "2.F",
+            #         "2.G",
+            #         "2.H",
+            #     ],
+            #     "entities": [
+            #         "KYOTOGHG (SARGWP100)",
+            #         "KYOTOGHG (AR4GWP100)",
+            #         "KYOTOGHG (AR5GWP100)",
+            #         "KYOTOGHG (AR6GWP100)",
+            #     ],
+            #     "dim": f'category ({coords_terminologies["category"]})',
+            # },
+            # "2_FGASES": {  # f-gases have no sectoral detail and HFCs only 2019,
+            #     # no downscaling of original data possible
+            #     # error of 3.5% for KYOTOGHG (AR6GWP100)
+            #     "check_consistency": False,
+            #     "basket": "2",
+            #     "basket_contents": [
+            #         "2.B",
+            #         "2.C",
+            #         "2.E",
+            #         "2.F",
+            #         "2.G",
+            #     ],
+            #     "entities": [
+            #         "FGASES (SARGWP100)",
+            #         "FGASES (AR4GWP100)",
+            #         "FGASES (AR5GWP100)",
+            #         "FGASES (AR6GWP100)",
+            #         "HFCS (SARGWP100)",
+            #         "HFCS (AR4GWP100)",
+            #         "HFCS (AR5GWP100)",
+            #         "HFCS (AR6GWP100)",
+            #     ],
+            #     "dim": f'category ({coords_terminologies["category"]})',
+            # },
+            "3_CH4": {
+                "basket": "3",
+                "basket_contents": ["3.A", "3.B", "3.C", "3.D"],
+                "entities": ["CH4"],
+                "dim": f'category ({coords_terminologies["category"]})',
+            },
+            "3_CO2": {
+                "basket": "3",
+                "basket_contents": ["3.B", "3.C", "3.D"],
+                "entities": ["CO2"],
+                "dim": f'category ({coords_terminologies["category"]})',
+            },
+            "3_N2O": {
+                "basket": "3",
+                "basket_contents": ["3.A", "3.B", "3.C", "3.D"],
+                "entities": ["N2O"],
+                "dim": f'category ({coords_terminologies["category"]})',
+            },
+            # "3_KYOTO": {  # no original data here. aggregate basket later
+            #     "basket": "3",
+            #     "basket_contents": ["3.A", "3.B", "3.C", "3.D"],
+            #     "entities": [
+            #         "KYOTOGHG (SARGWP100)",
+            #         "KYOTOGHG (AR4GWP100)",
+            #         "KYOTOGHG (AR5GWP100)",
+            #         "KYOTOGHG (AR6GWP100)",
+            #     ],
+            #     "dim": f'category ({coords_terminologies["category"]})',
+            # },
+        },
+        "entities": {
+            "FGASES": {
+                "basket": "FGASES (SARGWP100)",
+                "basket_contents": ["HFCS (SARGWP100)"],
+            },
+        },
+    },
+    "basket_copy": {
+        "GWPs_to_add": ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS"],
+        "source_GWP": gwp_to_use,
+    },
+}
+
+gas_baskets = {
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
+}
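
The gas_baskets mapping above defines how aggregate entities are summed from individual gases. A minimal sketch of the arithmetic behind the KYOTOGHG (SARGWP100) basket, with made-up emission values (the SAR GWP-100 factors of 21 for CH4 and 310 for N2O are the standard ones; an FGASES contribution, already in CO2eq, would simply be added):

    # hypothetical values in Gg, purely for illustration
    sar_gwp100 = {"CO2": 1, "CH4": 21, "N2O": 310}
    emissions_gg = {"CO2": 500.0, "CH4": 10.0, "N2O": 0.5}

    # each gas is converted to Gg CO2eq with its GWP and summed into the basket
    kyotoghg_sar = sum(v * sar_gwp100[g] for g, v in emissions_gg.items())
    print(f"KYOTOGHG (SARGWP100): {kyotoghg_sar} Gg CO2eq")  # 500 + 210 + 155 = 865.0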

+ 331 - 0
src/unfccc_ghg_data/unfccc_reader/Cabo_Verde/read_CPV_BUR1_from_pdf.py

@@ -0,0 +1,331 @@
+"""
+Read Cabo Verde's BUR1 from pdf
+"""
+
+import camelot
+import numpy as np
+import pandas as pd
+import primap2 as pm2
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)
+from unfccc_ghg_data.unfccc_reader.Cabo_Verde.config_cpv_bur1 import (
+    coords_cols,
+    coords_defaults,
+    coords_terminologies,
+    coords_value_mapping,
+    coords_value_mapping_main,
+    country_processing_step1,
+    country_processing_step2,
+    filter_remove,
+    gas_baskets,
+    inv_conf,
+    inv_conf_main,
+    inv_conf_per_sector,
+    meta_data,
+    trend_years,
+)
+
+if __name__ == "__main__":
+    # ###
+    # configuration
+    # ###
+
+    # for regex later
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    input_folder = downloaded_data_path / "UNFCCC" / "Cabo_Verde" / "BUR1"
+    output_folder = extracted_data_path / "UNFCCC" / "Cabo_Verde"
+
+    if not output_folder.exists():
+        output_folder.mkdir()
+
+    pdf_file = "BUR_EN_Digital.pdf"
+    output_filename = "CPV_BUR1_2023_"
+    category_column = f"category ({coords_terminologies['category']})"
+    compression = dict(zlib=True, complevel=9)
+
+    # ###
+    # 1. Read sector-specific main tables for 2019
+    # ###
+
+    df_main = None
+    for page in reversed(inv_conf_main["pages"].keys()):
+        print(f"Read table on page {page}")
+        tables_inventory_original = camelot.read_pdf(
+            str(input_folder / pdf_file),
+            pages=page,
+            flavor="lattice",
+            split_text=True,
+        )
+
+        df_page = tables_inventory_original[0].df
+
+        skip_rows_start = inv_conf_main["pages"][page]["skip_rows_start"]
+        if skip_rows_start != 0:
+            df_page = df_page[skip_rows_start:]
+
+        df_page.columns = inv_conf_main["pages"][page]["column_names"]
+
+        # first the manual replacements
+        df_page["category"] = df_page["category"].replace(
+            inv_conf_main["pages"][page]["cat_codes_manual"]
+        )
+
+        # Remove all dots from the category strings
+        df_page["category"] = df_page["category"].str.replace(".", "")
+        # Replace dashes with spaces (the code regex below expects whitespace after the code)
+        df_page["category"] = df_page["category"].str.replace("-", " ")
+
+        # then the regex replacements
+        df_page["category"] = df_page["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
+
+        df_page = pd.melt(
+            df_page,
+            id_vars="category",
+            value_vars=inv_conf_main["pages"][page]["entities"],
+        )
+
+        df_page = df_page.rename({"value": "data", "variable": "entity"}, axis=1)
+
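+        # convert decimal commas to decimal points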
+        df_page["data"] = df_page["data"].str.replace(",", ".")
+
+        # df_page["unit"] = df_page["entity"]
+
+        # set unit based on entity
+        df_page["unit"] = df_page["entity"].replace(
+            inv_conf_main["pages"][page]["unit_for_entity"]
+        )
+
+        # stack the tables vertically
+        if df_main is None:
+            df_main = df_page
+        else:
+            df_main = pd.concat(
+                [
+                    df_main,
+                    df_page,
+                ],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    df_main["time"] = inv_conf["year"]
+
+    # remove wrong codes in data column
+    df_main["data"] = df_main["data"].str.replace("HFC", "")
+
+    # Sum up the values for duplicate categories
+    cat = inv_conf["merge_cats"]
+    df_temp = df_main.loc[df_main["category"] == cat].copy()
+    df_temp["data"] = df_temp["data"].replace("", np.nan).apply(float)
+    df_temp = df_temp.groupby(
+        ["category", "entity", "unit", "time"], as_index=False
+    ).sum()
+    # change back to empty strings
+    df_temp = df_temp.replace(0, "")
+    # drop category from df
+    df_main = df_main.drop(df_main[df_main["category"] == cat].index)
+    # append the summed up sub-set
+    df_main = pd.concat(
+        [df_main, df_temp],
+        axis=0,
+        join="outer",
+    ).reset_index(drop=True)
+
+    df_main_if = pm2.pm2io.convert_long_dataframe_if(
+        df_main,
+        coords_cols=coords_cols,
+        # add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping_main,
+        # coords_value_filling=coords_value_filling,
+        filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
+
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_if)
+
+    # ###
+    # 2. Read trend tables 1995, 2000, 2005, 2010, 2015 and 2019
+    # ###
+    df_trend = None
+    for sector in reversed(inv_conf_per_sector.keys()):
+        tables_inventory_original = camelot.read_pdf(
+            str(input_folder / pdf_file),
+            pages=inv_conf_per_sector[sector]["page"],
+            flavor="lattice",
+            split_text=True,
+        )
+
+        df_page = tables_inventory_original[0].df
+
+        # cut rows at the top if needed
+        skip_rows_start = inv_conf_per_sector[sector]["skip_rows_start"]
+        if skip_rows_start != 0:
+            df_page = df_page[skip_rows_start:]
+
+        # drop columns if needed
+        if "drop_cols" in inv_conf_per_sector[sector].keys():
+            df_page = df_page.drop(columns=inv_conf_per_sector[sector]["drop_cols"])
+
+        df_page.columns = inv_conf_per_sector[sector]["header"]
+
+        # fill empty strings with NaN and then forward fill the category names
+        df_page["category"] = df_page["category"].replace("", np.nan).ffill()
+
+        # remove \n from category names
+        df_page["category"] = df_page["category"].str.replace("\n", "")
+        # manual replacement of categories
+        df_page["category"] = df_page["category"].replace(
+            inv_conf_per_sector[sector]["cat_codes_manual"]
+        )
+
+        # convert decimal commas to decimal points
+        for year in trend_years:
+            df_page[year] = df_page[year].str.replace(",", ".")
+
+        # add unit
+        df_page["unit"] = inv_conf_per_sector[sector]["unit"]
+
+        # add entity if needed
+        if "entity" in inv_conf_per_sector[sector].keys():
+            df_page["entity"] = inv_conf_per_sector[sector]["entity"]
+
+        if "unit_conversion" in inv_conf_per_sector[sector].keys():
+            for year in trend_years:
+                index = inv_conf_per_sector[sector]["unit_conversion"]["index"]
+                conv_factor = inv_conf_per_sector[sector]["unit_conversion"][
+                    "conversion_factor"
+                ]
+                df_page.loc[index, year] = str(
+                    conv_factor * float(df_page.loc[index, year])
+                )
+
+        # stack the tables vertically
+        if df_trend is None:
+            df_trend = df_page
+        else:
+            df_trend = pd.concat(
+                [
+                    df_trend,
+                    df_page,
+                ],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    df_trend_if = pm2.pm2io.convert_wide_dataframe_if(
+        df_trend,
+        coords_cols=coords_cols,
+        # add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+    )
+
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_if)
+
+    # ###
+    # Merge the main table for 2019 and the trend tables
+    # ###
+
+    print("Merging main table and trend tables")
+    print("Merging waste table.")
+    data_pm2 = data_main_pm2.pr.merge(data_trend_pm2)  # , tolerance=0.10)
+
+    # # ###
+    # # Save raw data to IF and native format.
+    # # ###
+
+    data_if = data_pm2.pr.to_interchange_format()
+
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
+
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
+
+    # # ###
+    # # Processing
+    # # ###
+
+    # create the gas baskets before aggregating the categories
+    # data_proc_pm2_gas_baskets = process_data_for_country(
+    #     data_country=data_pm2,
+    #     entities_to_ignore=[],
+    #     gas_baskets=gas_baskets,
+    #     filter_dims=None,
+    #     cat_terminology_out=None,
+    #     category_conversion=None,
+    #     sectors_out=None,
+    #     processing_info_country=None,
+    # )
+
+    data_proc_pm2 = process_data_for_country(
+        data_country=data_pm2,
+        entities_to_ignore=[],
+        gas_baskets={},  # gas_baskets,
+        filter_dims=None,
+        cat_terminology_out=None,
+        category_conversion=None,
+        sectors_out=None,
+        processing_info_country=country_processing_step1,
+    )
+
+    data_proc_pm2 = process_data_for_country(
+        data_country=data_proc_pm2,
+        entities_to_ignore=[],
+        gas_baskets=gas_baskets,  # None,
+        filter_dims=None,
+        cat_terminology_out=None,
+        category_conversion=None,
+        sectors_out=None,
+        processing_info_country=country_processing_step2,
+    )
+
+    # # ###
+    # # save processed data to IF and native format
+    # # ###
+
+    terminology_proc = coords_terminologies["category"]
+
+    data_proc_if = data_proc_pm2.pr.to_interchange_format()
+
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
+
+    encoding = {var: compression for var in data_proc_pm2.data_vars}
+    data_proc_pm2.pr.to_netcdf(
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )
+
+    print("Saved processed data.")

Some files were not shown because too many files changed in this diff