Преглед изворни кода

Merge branch 'read-burundi-bur1' of jguetschow/UNFCCC_non-AnnexI_data into main

crdanielbusch пре 11 месеци
родитељ
комит
730404c11b

+ 217 - 0
UNFCCC_GHG_data/UNFCCC_reader/Burundi/config_BDI_BUR1.py

@@ -0,0 +1,217 @@
+# Terminology names for the area, category and scenario coordinates.
+# NOTE(review): this dict is defined a second time, with identical content,
+# further down in this module (after coords_defaults); one of the two
+# definitions is redundant.
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+# define config dict
+# Settings shared by the inventory tables of all years.
+inv_conf = {
+    # row positions of the entity names and the units in the two-row header
+    # that is built from "header" and "unit" below
+    "entity_row": 0,
+    "unit_row": 1,
+    # name of the column holding the category labels; used as table index
+    "index_cols": "Greenhouse gas source and sink categories",
+    # column names of the table after conversion to long format
+    "header_long": ["orig_cat_name", "entity", "unit", "time", "data"],
+    # captures a leading code of 1 to 11 letters, digits or dots that is
+    # followed by whitespace or a dot
+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*",
+    # one entry per table column (13 in total); "unit" below has the same
+    # length and order
+    "header": [
+        "Greenhouse gas source and sink categories",
+        "CO2",
+        "CH4",
+        "N2O",
+        "HFCs",
+        "PFCs",
+        "SF6",
+        "other halogenated gases",
+        "Other halogenated gases without CO2 equivalent conversion factors",
+        "NOx",
+        "CO",
+        "NMVOCs",
+        "SO2",
+    ],
+    "unit": [
+        "-",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+        "GgCO2eq",
+        "GgCO2eq",
+        "GgCO2eq",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+    ],
+    # category labels that are mapped to a code by exact match instead of
+    # being parsed with "cat_code_regexp"
+    "cat_codes_manual": {
+        "Memo Items (5)": "MEMO",
+        "International Bunkers": "M.BK",
+        "1.A.3.a.i - International Aviation (International Bunkers) (1)": "M.BK.A",
+        "1.A.3.d.i - International water-borne navigation (International bunkers) (1)": "M.BK.M",
+        # TODO: Handle with regex instead of explicitly adding all options.
+        # The next two keys differ from the one above only by a long run of
+        # spaces inside the label.
+        "1.A.3.d.i - International water-borne navigation (International                      bunkers) (1)": "M.BK.M",
+        "1.A.3.d.i - International water-borne navigation (International bunkers)                      (1)": "M.BK.M",
+        "1.A.5.c - Multilateral Operations (1)(2)": "M.MULTIOP",
+        "Total National Emissions and Removals": "0",
+    },
+}
+
+# Per-year reading settings, keyed by the inventory year as string.
+# "pages_to_read" lists the four pdf pages holding the table of that year.
+# Note that the page numbers jump from 216 (2009) to 221 (2010).
+inv_conf_per_year = {
+    "2005": {
+        "pages_to_read": ["197", "198", "199", "200"],
+    },
+    "2006": {
+        "pages_to_read": ["201", "202", "203", "204"],
+    },
+    "2007": {
+        "pages_to_read": ["205", "206", "207", "208"],
+    },
+    "2008": {
+        "pages_to_read": ["209", "210", "211", "212"],
+    },
+    "2009": {
+        "pages_to_read": ["213", "214", "215", "216"],
+    },
+    "2010": {
+        "pages_to_read": ["221", "222", "223", "224"],
+    },
+    "2011": {
+        "pages_to_read": ["225", "226", "227", "228"],
+    },
+    "2012": {
+        "pages_to_read": ["229", "230", "231", "232"],
+    },
+    "2013": {
+        "pages_to_read": ["233", "234", "235", "236"],
+    },
+    "2014": {
+        "pages_to_read": ["237", "238", "239", "240"],
+    },
+    "2015": {
+        "pages_to_read": ["241", "242", "243", "244"],
+        # Some values move to wrong columns
+        # each entry is (row index, column, value to write into that cell)
+        "fix_values": [
+            (2, 10, "21,529"),
+            (1, 12, "NMVOCs"),
+            (2, 12, "0"),
+        ],
+        # for this table an additional column is created
+        # that needs to be deleted
+        "delete_columns": [11],
+    },
+    "2016": {
+        "pages_to_read": ["245", "246", "247", "248"],
+    },
+    "2017": {
+        "pages_to_read": ["249", "250", "251", "252"],
+    },
+    "2018": {
+        "pages_to_read": ["253", "254", "255", "256"],
+    },
+    "2019": {
+        "pages_to_read": ["257", "258", "259", "260"],
+    },
+}
+
+# primap2 format conversion
+# Maps coordinate names to the columns of the long-format table that hold them.
+coords_cols = {
+    "category": "category",
+    "entity": "entity",
+    "unit": "unit",
+}
+
+# Coordinates that get one fixed value for the whole dataset.
+coords_defaults = {
+    "source": "BDI-GHG-Inventory",
+    "provenance": "measured",
+    "area": "BDI",
+    "scenario": "BUR1",
+}
+
+# NOTE(review): identical to the coords_terminologies assignment at the top
+# of this module; being the later assignment, this is the one in effect.
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+# Page 64: The global warming potentials (GWPs) recommended by the IPCC Fifth Assessment Report (AR5)
+# and based on the annex to Decision 18/CMA.1 have been used to convert GHGs other than CO2
+# into their equivalent. These GWPs provide a consistent basis for comparing the relative effect
+# of emissions of all GHGs standardized over a 100-year period by converting emissions of other
+# GHGs into those of CO2. The values adopted for the three direct GHGs are 1 for CO2, 28 for CH4
+# and 265 for N2O.
+gwp_to_use = "AR5GWP100"
+# Value mappings for the unit, category and entity coordinates.
+# Entity names from the table header are renamed explicitly; the three
+# f-gas columns get the GWP specification appended to the entity name.
+coords_value_mapping = {
+    "unit": "PRIMAP1",
+    "category": "PRIMAP1",
+    "entity": {
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "SF6": f"SF6 ({gwp_to_use})",
+        "NMVOCs": "NMVOC",
+    },
+}
+
+# Filters handed to the interchange-format conversion as `filter_remove`;
+# presumably every row matching one of them is dropped (verify against the
+# primap2 documentation).
+filter_remove = {
+    # rows labelled with the manual "MEMO" category code
+    "f_memo": {"category": "MEMO"},
+    # rows without a category code
+    "f_empty": {"category": ""},
+    # "f1": {
+    #     "entity": ["Other halogenated gases with CO2 equivalent conversion factors"],
+    # },
+    "f2": {
+        "entity": ["Other halogenated gases without CO2 equivalent conversion factors"],
+    },
+}
+
+# Dataset attributes handed to the interchange-format conversion.
+meta_data = {
+    "references": "https://unfccc.int/documents/611668",
+    "rights": "",  # unknown
+    "contact": "daniel-busch@climate-resource.de",
+    "title": "Burundi. Biennial update report (BUR). BUR1",
+    # NOTE(review): "fom" is a typo for "from"; the string is written to the
+    # output metadata, so fixing it changes the generated files.
+    "comment": "Read fom pdf by Daniel Busch",
+    "institution": "UNFCCC",
+}
+
+# Processing settings handed to process_data_for_country as
+# `processing_info_country`.
+country_processing_step1 = {
+    # additional categories, each with the list of source categories it is
+    # built from and a descriptive name; later entries use earlier ones as
+    # sources (e.g. "M.AG.ELV" uses "M.3.C.AG" and "M.3.D.AG")
+    "aggregate_cats": {
+        "M.3.C.AG": {
+            "sources": [
+                "3.C.1",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+                "3.C.8",
+            ],
+            "name": "Aggregate sources and non-CO2 emissions sources on land "
+            "(Agriculture)",
+        },
+        "M.3.D.AG": {"sources": ["3.D.2"], "name": "Other (Agriculture)"},
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG", "M.3.D.AG"],
+            "name": "Agriculture excluding livestock",
+        },
+        "M.AG": {"sources": ["3.A", "M.AG.ELV"], "name": "Agriculture"},
+        "M.3.D.LU": {"sources": ["3.D.1"], "name": "Other (LULUCF)"},
+        "M.LULUCF": {"sources": ["3.B", "M.3.D.LU"], "name": "LULUCF"},
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4", "5"],
+            "name": "National total emissions excluding LULUCF",
+        },
+    },
+    # GWP specifications other than the one used in the report (gwp_to_use)
+    # for which the listed basket entities are to be provided as well
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR4GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
+    },
+}
+
+# Gas baskets handed to process_data_for_country: basket entity -> entities
+# it consists of. One FGASES and one KYOTOGHG basket per GWP specification;
+# each KYOTOGHG basket contains the FGASES basket of the same specification.
+gas_baskets = {
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
+}

+ 226 - 0
UNFCCC_GHG_data/UNFCCC_reader/Burundi/read_BDI_BUR1_from_pdf.py

@@ -0,0 +1,226 @@
+import camelot
+import primap2 as pm2
+import pandas as pd
+
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper.functions import process_data_for_country
+
+from config_BDI_BUR1 import (
+    inv_conf,
+    meta_data,
+    filter_remove,
+    coords_value_mapping,
+    coords_terminologies,
+    coords_defaults,
+    coords_cols,
+    gas_baskets,
+    country_processing_step1,
+    inv_conf_per_year,
+)
+
+# ###
+# configuration
+# ###
+
+input_folder = downloaded_data_path / "UNFCCC" / "Burundi" / "BUR1"
+output_folder = extracted_data_path / "UNFCCC" / "Burundi"
+
+if not output_folder.exists():
+    output_folder.mkdir()
+
+pdf_file = "Burundi_BUR_1_Report__Francais.pdf"
+output_filename = "BDI_BUR1_2023_"
+category_column = f"category ({coords_terminologies['category']})"
+compression = dict(zlib=True, complevel=9)
+
+# ###
+# 1. Read in tables
+# ###
+
+df_all = None
+for year in inv_conf_per_year.keys():
+    print("-" * 60)
+    print(f"Reading year {year}.")
+    print("-" * 60)
+    df_year = None
+    for page in inv_conf_per_year[year]["pages_to_read"]:
+        print(f"Reading table from page {page}.")
+        tables_inventory_original = camelot.read_pdf(
+            str(input_folder / pdf_file),
+            pages=page,
+            flavor="lattice",
+            split_text=True,
+        )
+        print("Reading complete.")
+
+        df_page = tables_inventory_original[0].df
+
+        if df_year is None:
+            df_year = df_page
+        else:
+            df_year = pd.concat(
+                [df_year, df_page],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    print(f"Concatenating all tables for {year}.")
+    # remove line breaks
+    for column in df_year.columns:
+        df_year[column] = df_year[column].str.replace("\n", "")
+
+    # fix broken values in cells
+    if "fix_values" in inv_conf_per_year[year].keys():
+        for index, column, value in inv_conf_per_year[year]["fix_values"]:
+            df_year.at[index, column] = value
+
+    # delete extra columns
+    if "delete_columns" in inv_conf_per_year[year].keys():
+        for column in inv_conf_per_year[year]["delete_columns"]:
+            df_year = df_year.drop(columns=column)
+        df_year.columns = range(df_year.columns.size)
+
+    df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
+
+    df_year = pd.concat([df_header, df_year[2:]], axis=0, join="outer").reset_index(
+        drop=True
+    )
+
+    df_year = pm2.pm2io.nir_add_unit_information(
+        df_year,
+        unit_row=inv_conf["unit_row"],
+        entity_row=inv_conf["entity_row"],
+        regexp_entity=".*",
+        regexp_unit=".*",
+        default_unit="Gg",
+    )
+
+    print("Added unit information.")
+
+    # set index
+    df_year = df_year.set_index(inv_conf["index_cols"])
+
+    # convert to long format
+    df_year_long = pm2.pm2io.nir_convert_df_to_long(
+        df_year, year, inv_conf["header_long"]
+    )
+
+    # extract from tuple
+    df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
+
+    # prep for conversion to PM2 IF and native format
+    # make a copy of the categories row
+    df_year_long["category"] = df_year_long["orig_cat_name"]
+
+    # replace cat names by codes in col "category"
+    # first the manual replacements
+    df_year_long["category"] = df_year_long["category"].str.replace("\n", "")
+
+    df_year_long["category"] = df_year_long["category"].replace(
+        inv_conf["cat_codes_manual"]
+    )
+
+    df_year_long["category"] = df_year_long["category"].str.replace(".", "")
+
+    # then the regex replacements
+    def repl(m):
+        return m.group("code")
+
+    df_year_long["category"] = df_year_long["category"].str.replace(
+        inv_conf["cat_code_regexp"], repl, regex=True
+    )
+
+    df_year_long = df_year_long.reset_index(drop=True)
+
+    df_year_long["data"] = df_year_long["data"].str.replace(",", ".")
+
+    # TODO: I don't think there are NE1 in the tables.
+    # df_year_long["data"] = df_year_long["data"].str.replace("NE1", "NE")
+
+    # make sure all col headers are str
+    df_year_long.columns = df_year_long.columns.map(str)
+
+    df_year_long = df_year_long.drop(columns=["orig_cat_name"])
+
+    if df_all is None:
+        df_all = df_year_long
+    else:
+        df_all = pd.concat(
+            [df_all, df_year_long],
+            axis=0,
+            join="outer",
+        ).reset_index(drop=True)
+
+### convert to interchange format ###
+print("Converting to interchange format.")
+df_all_IF = pm2.pm2io.convert_long_dataframe_if(
+    df_all,
+    coords_cols=coords_cols,
+    coords_defaults=coords_defaults,
+    coords_terminologies=coords_terminologies,
+    coords_value_mapping=coords_value_mapping,
+    filter_remove=filter_remove,
+    meta_data=meta_data,
+    convert_str=True,
+    time_format="%Y",
+)
+
+
+### convert to primap2 format ###
+print("Converting to primap2 format.")
+data_pm2 = pm2.pm2io.from_interchange_format(df_all_IF)
+
+
+# ###
+# Save raw data to IF and native format.
+# ###
+
+data_if = data_pm2.pr.to_interchange_format()
+
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+    data_if,
+)
+
+encoding = {var: compression for var in data_pm2.data_vars}
+data_pm2.pr.to_netcdf(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+    encoding=encoding,
+)
+
+
+# ###
+# Processing
+# ###
+
+data_proc_pm2 = process_data_for_country(
+    data_country=data_pm2,
+    entities_to_ignore=[],
+    gas_baskets=gas_baskets,
+    filter_dims=None,
+    cat_terminology_out=None,
+    category_conversion=None,
+    sectors_out=None,
+    processing_info_country=country_processing_step1,
+)
+
+# ###
+# save processed data to IF and native format
+# ###
+
+terminology_proc = coords_terminologies["category"]
+
+data_proc_if = data_proc_pm2.pr.to_interchange_format()
+
+if not output_folder.exists():
+    output_folder.mkdir()
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + terminology_proc), data_proc_if
+)
+
+encoding = {var: compression for var in data_proc_pm2.data_vars}
+data_proc_pm2.pr.to_netcdf(
+    output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+)
+
+print("Saved processed data.")

+ 2 - 1
UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.json

@@ -14,5 +14,6 @@
     "MYS": "Malaysia",
     "MNE": "Montenegro",
     "ISR": "Israel",
-    "IDN": "Indonesia"
+    "IDN": "Indonesia",
+    "BDI": "Burundi"
 }

+ 1 - 0
extracted_data/UNFCCC/Burundi/BDI_BUR1_2023_IPCC2006_PRIMAP.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/PK/xF/MD5E-s299244--f790b1166c54fd5b601264b6f563487e.csv/MD5E-s299244--f790b1166c54fd5b601264b6f563487e.csv

+ 1 - 0
extracted_data/UNFCCC/Burundi/BDI_BUR1_2023_IPCC2006_PRIMAP.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/J6/kq/MD5E-s280055--6e7d18a4d48652ecbd3d77c2800c8226.nc/MD5E-s280055--6e7d18a4d48652ecbd3d77c2800c8226.nc

+ 22 - 0
extracted_data/UNFCCC/Burundi/BDI_BUR1_2023_IPCC2006_PRIMAP.yaml

@@ -0,0 +1,22 @@
+attrs:
+  references: https://unfccc.int/documents/611668
+  rights: ''
+  contact: daniel-busch@climate-resource.de
+  title: Burundi. Biennial update report (BUR). BUR1 Processed on 2024-04-16
+  comment: Read fom pdf by Daniel Busch Processed on 2024-04-16
+  institution: UNFCCC
+  cat: category (IPCC2006_PRIMAP)
+  area: area (ISO3)
+  scen: scenario (PRIMAP)
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - category (IPCC2006_PRIMAP)
+  - area (ISO3)
+  - source
+  - scenario (PRIMAP)
+  - provenance
+  - entity
+  - unit
+data_file: BDI_BUR1_2023_IPCC2006_PRIMAP.csv

+ 1 - 0
extracted_data/UNFCCC/Burundi/BDI_BUR1_2023_IPCC2006_PRIMAP_raw.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/WF/7x/MD5E-s114782--4bbd01fb2bbed1cfa762c674e0447fbe.csv/MD5E-s114782--4bbd01fb2bbed1cfa762c674e0447fbe.csv

+ 1 - 0
extracted_data/UNFCCC/Burundi/BDI_BUR1_2023_IPCC2006_PRIMAP_raw.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/xv/7q/MD5E-s111420--fcbfc2cd0e8adf22cbbebfda5ad57ef3.nc/MD5E-s111420--fcbfc2cd0e8adf22cbbebfda5ad57ef3.nc

+ 22 - 0
extracted_data/UNFCCC/Burundi/BDI_BUR1_2023_IPCC2006_PRIMAP_raw.yaml

@@ -0,0 +1,22 @@
+attrs:
+  references: https://unfccc.int/documents/611668
+  rights: ''
+  contact: daniel-busch@climate-resource.de
+  title: Burundi. Biennial update report (BUR). BUR1
+  comment: Read fom pdf by Daniel Busch
+  institution: UNFCCC
+  cat: category (IPCC2006_PRIMAP)
+  area: area (ISO3)
+  scen: scenario (PRIMAP)
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - category (IPCC2006_PRIMAP)
+  - area (ISO3)
+  - source
+  - scenario (PRIMAP)
+  - provenance
+  - entity
+  - unit
+data_file: BDI_BUR1_2023_IPCC2006_PRIMAP_raw.csv