
Add Peru BUR3 code and some modifications to functions

Johannes Gütschow 1 year ago
parent commit 61639b9f1e

+ 560 - 0
UNFCCC_GHG_data/UNFCCC_reader/Peru/config_PER_BUR3.py

@@ -0,0 +1,560 @@
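+# camelot extraction templates, keyed by PDF page number: "area" is the table
+# bounding box and "cols" are the column separators (both in PDF points).
+# "rows_to_fix" lists category names that the PDF layout splits over several
+# physical rows; the dict key is the n_rows argument passed to the fix_rows
+# helper in the reading script.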
+table_def_templates = {
+    "300": {  # 300
+        "area": ["69,457,727,78"],
+        "cols": ["288,352,391,426,458,485,519,552,587,615,643"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de uso",
+                "Uso de productos sustitutos de las sustancias que",
+            ],
+            2: [
+                "1A Actividades de quema de combustible",
+                "2A Industria de los minerales",
+                "2B Industria química",
+                "2C Industria de los metales",
+                "2E Industria electrónica",
+                "3A Ganado",
+                "3A1 Fermentación entérica",
+            ],
+        },
+    },
+    "301": {  # 301
+        "area": ["72,542,727,99"],
+        "cols": ["288,352,391,426,458,485,519,552,587,615,643"],
+        "rows_to_fix": {
+            3: [
+                "Fuentes agregadas y fuentes de emisión no CO2 de",
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            2: [
+                "3A2 Manejo del estiércol",
+                "3C1 Emisiones por quema de biomasa",
+                "3C3 Aplicación de urea",
+                "3C7 Cultivo de arroz",
+                "A Disposición de residuos sólidos",
+                "B Tratamiento biológico de residuos",
+                "C Incineración de residuos",
+                "D Tratamiento y descarga de aguas residuales",
+                "Búnker internacional",
+            ],
+        },
+    },
+    "302": {  # 302
+        "area": ["72,510,727,79"],
+        "cols": ["278,335,376,415,453,482,512,548,585,623,656"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emision no CO2",
+            ],
+            -3: ["Total de las emisiones y remociones nacionales"],
+        },
+    },
+    "303": {  # 303
+        "area": ["72,540,727,127"],
+        "cols": ["278,335,376,415,453,482,512,548,585,623,656"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            2: ["Aviación internacional"],
+        },
+    },
+    "304": {  # 304
+        "area": ["72,510,727,70"],
+        "cols": ["275,332,365,408,441,470,499,533,577,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "305": {  # 305
+        "area": ["72,540,727,108"],
+        "cols": ["275,332,365,408,441,470,499,533,577,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+    "306": {  # 306
+        "area": ["72,510,727,70"],
+        "cols": ["266,320,364,405,440,468,499,536,576,620,656"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la",
+                "Productos no energéticos de combustibles y",
+                "Uso de productos sustitutos de las sustancias",
+                "Fuentes agregadas y fuentes de emisión no",
+            ],
+        },
+    },
+    "307": {  # 307
+        "area": ["72,540,727,108"],
+        "cols": ["266,320,364,405,440,468,499,536,576,620,656"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA",
+            ],
+        },
+    },
+    "308": {  # 308
+        "area": ["72,510,727,70"],
+        "cols": ["278,329,372,406,441,470,500,536,579,621,653"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "309": {  # 309
+        "area": ["72,540,727,117"],
+        "cols": ["278,329,372,406,441,470,500,536,579,621,653"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+    "310": {  # 310
+        "area": ["72,510,727,70"],
+        "cols": ["279,334,379,418,453,480,505,541,582,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "311": {  # 311
+        "area": ["72,540,727,110"],
+        "cols": ["279,334,379,418,453,480,505,541,582,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            -2: ["Emisiones de CO2 de la biomasa"],
+        },
+    },
+    "312": {  # 312
+        "area": ["72,510,727,70"],
+        "cols": ["297,349,393,426,461,489,514,547,592,629,657"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones fugitivas provenientes de la fabricación de",
+                "Productos no energéticos de combustibles y de uso de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2 de la",
+            ],
+        },
+    },
+    "313": {  # 313
+        "area": ["72,540,727,90"],
+        "cols": ["297,349,393,426,461,489,514,547,592,629,657"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+}
+
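+# column header shared by all tables: one entity and one unit per column. The
+# reading script zips the two lists into a pandas MultiIndex header.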
+header = {
+    "entity": [
+        "Categorías de emisiones y sumideros de GEI",
+        "Emisiones/remociones netas de CO2",
+        "CH4",
+        "N2O",
+        "HFC",
+        "PFC",
+        "SF6",
+        "CO",
+        "NOx",
+        "COVDM",
+        "SOX",
+        "Emisiones/remociones totales de GEI",
+    ],
+    "unit": [
+        "",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+        "GgCO2eq",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+    ],
+}
+
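+# per-page table definitions: the template to use, the header, the raw header
+# rows to drop, the name of the category column, and the inventory year that
+# the page covers.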
+table_defs = {
+    "300": {
+        "templates": ["300"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4, 5],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2000,
+        "coords_value_mapping": "default",
+    },
+    "301": {
+        "templates": ["301"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4, 5],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2000,
+        "coords_value_mapping": "default",
+    },
+    "302": {
+        "templates": ["302"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2005,
+        "coords_value_mapping": "default",
+    },
+    "303": {
+        "templates": ["303"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2005,
+        "coords_value_mapping": "default",
+    },
+    "304": {
+        "templates": ["304"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2010,
+        "coords_value_mapping": "default",
+    },
+    "305": {
+        "templates": ["305"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2010,
+        "coords_value_mapping": "default",
+    },
+    "306": {
+        "templates": ["306"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2012,
+        "coords_value_mapping": "default",
+    },
+    "307": {
+        "templates": ["307"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2012,
+        "coords_value_mapping": "default",
+    },
+    "308": {
+        "templates": ["308"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2014,
+        "coords_value_mapping": "default",
+    },
+    "309": {
+        "templates": ["309"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2014,
+        "coords_value_mapping": "default",
+    },
+    "310": {
+        "templates": ["310"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2016,
+        "coords_value_mapping": "default",
+    },
+    "311": {
+        "templates": ["311"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2016,
+        "coords_value_mapping": "default",
+    },
+    "312": {
+        "templates": ["312"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2019,
+        "coords_value_mapping": "default",
+    },
+    "313": {
+        "templates": ["313"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2019,
+        "coords_value_mapping": "default",
+    },
+}
+
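+# camelot sometimes merges the category code into the middle of a wrapped
+# category name; map those garbled strings back to "<code> <name>" so the
+# code regexp below can pick up the code.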
+cat_names_fix = {
+    "Industrias manufactureras y de la 1A2 construcción":
+        "1A2 Industrias manufactureras y de la construcción",
+    "Emisiones fugitivas provenientes de la fabricación 1B de combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Emisiones fugitivas provenientes de la 1B fabricación de combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Emisiones fugitivas provenientes de la fabricación de 1B combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Productos no energéticos de combustibles y de uso 2D de solventes":
+        "2D Productos no energéticos de combustibles y de uso de solventes",
+    "Productos no energéticos de combustibles y de 2D uso de solventes":
+        "2D Productos no energéticos de combustibles y de uso de solventes",
+    "Uso de productos sustitutos de las sustancias que 2F agotan la capa de ozono":
+        "2F Uso de productos sustitutos de las sustancias que agotan la capa de ozono",
+    "Uso de productos sustitutos de las sustancias 2F que agotan la capa de ozono":
+        "2F Uso de productos sustitutos de las sustancias que agotan la capa de ozono",
+    "Fuentes agregadas y fuentes de emisión no CO2 de 3C la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emision no CO2 3C de la tierra":
+        "3C Fuentes agregadas y fuentes de emision no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no CO2 3C de la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no 3C CO2 de la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no CO2 de la 3C tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Emisiones directas de N2O en suelos 3C4 gestionados":
+        "3C4 Emisiones directas de N2O en suelos gestionados",
+    "Emisiones indirectas de N2O en suelos 3C5 gestionados":
+        "3C5 Emisiones indirectas de N2O en suelos gestionados",
+    "Emisiones indirectas de N2O por manejo del 3C6 estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "Emisiones indirectas de N2O por manejo 3C6 del estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "Emisiones indirectas de N2O por 3C6 manejo del estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y 4 SILVICULTURA":
+        "4 USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y SILVICULTURA",
+    "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA 4 Y SILVICULTURA":
+        "4 USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y SILVICULTURA",
+}
+
+values_replacement = {
+    #    '': '-',
+    " ": "",
+}
+
+gwp_to_use = "AR5GWP100"
+
+index_cols = ["orig_cat_name"]
+cols_for_space_stripping = index_cols
+
+unit_row = "header"
+
+## parameters part 2: conversion to PRIMAP2 interchange format
+
+cats_remove = ["Partidas informativas"]
+
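+# categories whose codes cannot be parsed from the name; "M."-prefixed codes
+# are PRIMAP's custom (meta) categories.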
+cat_codes_manual = {
+    "Emisiones de CO2 de la biomasa": "M.BIO",
+    "Total de las emisiones y remociones nacionales": "0",
+    "Búnker internacional": "M.BK",
+    "Aviación internacional": "M.BK.A",
+    "Transporte marítimo y fluvial internacional": "M.BK.M",
+    "A Disposición de residuos sólidos": "5.A",
+    "B Tratamiento biológico de residuos": "5.B",
+    "C Incineración de residuos": "5.C",
+    "D Tratamiento y descarga de aguas residuales": "5.D",
+    "Tierras": "M.2006.3.B",
+}
+
+
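+# regexp to extract the category code from the beginning of the category name,
+# e.g. "1A2 Industrias manufactureras ..." -> "1A2"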
+cat_code_regexp = r"(?P<code>^[A-Za-z0-9]{1,7})\s.*"
+
+# special header as category code and name in one column
+header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
+
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC1996_2006_PER_INV",
+    "scenario": "PRIMAP",
+}
+
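+# terminology for the processed output; the raw data keeps the inventory's own
+# category scheme (IPCC1996_2006_PER_INV) defined above.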
+coords_terminologies_2006 = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+coords_defaults = {
+    "source": "PER-GHG-inventory ",
+    "provenance": "measured",
+    "area": "PER",
+    "scenario": "BUR3",
+}
+
+coords_value_mapping = {
+    "default": {
+        "unit": "PRIMAP1",
+        "entity": {
+            "Emisiones/remociones netas de CO2": "CO2",
+            "CH4": "CH4",
+            "N2O": "N2O",
+            "HFC": f"HFCS ({gwp_to_use})",
+            "PFC": f"PFCS ({gwp_to_use})",
+            "SF6": "SF6",
+            "CO": "CO",
+            "NOx": "NOX",
+            "COVDM": "NMVOC",
+            "SOx": "SOX",
+            "Emisiones/remociones totales de GEI": f"KYOTOGHG ({gwp_to_use})",
+        },
+    },
+}
+
+coords_cols = {"category": "category", "entity": "entity", "unit": "unit"}
+
+add_coords_cols = {
+    "orig_cat_name": ["orig_cat_name", "category"],
+}
+
+filter_remove = {
+    # "f1" :{
+    #     "entity": ["HFC-125", "HFC-134a", "HFC-143a", "HFC-152a", "HFC-227ea",
+    #                "HFC-23", "HFC-32", "HFC-41", "HFC-43-10mee", "PFC-116",
+    #                "PFC-14", "PFC-218", "PFC-318", "NF3", "SF6"],
+    #     "category": "2"
+    # }
+}
+
+meta_data = {
+    "references": "https://unfccc.int/documents/",
+    "rights": "",
+    "contact": "mail@johannes-guetschow.de",
+    "title": "",
+    "comment": "Read fom pdf file by Johannes Gütschow",
+    "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
+}
+
+
+## processing
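+# conversion to IPCC2006_PRIMAP categories: the inventory uses 4 for LULUCF
+# and 5 for waste, which map to 3.B/3.D and 4 in IPCC 2006; the "aggregate"
+# block then rebuilds the totals for the categories created by the mapping.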
+cat_conversion = {
+    'mapping': {
+        '0': '0',
+        '1': '1',
+        '1.A': '1.A',
+        '1.A.1': '1.A.1',
+        '1.A.2': '1.A.2',
+        '1.A.3': '1.A.3',
+        '1.A.4': '1.A.4',
+        '1.A.5': '1.A.5',
+        '1.B': '1.B',
+        '1.B.1': '1.B.1',
+        '1.B.2': '1.B.2',
+        '2': '2',
+        '2.A': '2.A',
+        '2.B': '2.B',
+        '2.C': '2.C',
+        '2.D': '2.D',
+        '2.E': '2.E',
+        '2.F': '2.F',
+        '2.G': '2.G',
+        '2.H': '2.H',
+        '3': 'M.AG',
+        '3.A': '3.A',
+        '3.A.1': '3.A.1',
+        '3.A.2': '3.A.2',
+        '3.C': '3.C',
+        '3.C.1': '3.C.1',
+        '3.C.2': '3.C.2',
+        '3.C.3': '3.C.3',
+        '3.C.4': '3.C.4',
+        '3.C.5': '3.C.5',
+        '3.C.6': '3.C.6',
+        '3.C.7': '3.C.7',
+        '4': 'M.LULUCF',
+        'M.2006.3.B': '3.B',
+        '4.A': '3.B.1',
+        '4.B': '3.B.2',
+        '4.C': '3.B.3',
+        '4.D': '3.B.4',
+        '4.E': '3.B.5',
+        '4.F': '3.B.6',
+        '4.G': '3.D.1',
+        '5': '4',
+        '5.A': '4.A',
+        '5.B': '4.B',
+        '5.C': '4.C',
+        '5.D': '4.D',
+        'M.BK': 'M.BK',
+        'M.BK.A': 'M.BK.A',
+        'M.BK.M': 'M.BK.M',
+        'M.BIO': 'M.BIO',
+    },
+    'aggregate': {
+        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
+              'name': 'IPPU'},
+        'M.3.C.AG': {
+            'sources': ['3.C'],
+            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
+        'M.AG.ELV': {'sources': ['M.3.C.AG'],
+                     'name': 'Agriculture excluding livestock emissions'},
+        '3.D': {'sources': ['3.D.1'], 'name': 'Other'},
+        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    },
+}
+
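+# the inventory reports HFC and PFC baskets in AR5 GWPs only; copy them to the
+# other GWP contexts using the GWP_factors from the helper module.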
+processing_info = {
+    'basket_copy': {
+        'GWPs_to_add': ["SARGWP100", "AR4GWP100", "AR6GWP100"],
+        'entities': ["HFCS", "PFCS"],
+        'source_GWP': gwp_to_use,
+    },
+}

+ 290 - 0
UNFCCC_GHG_data/UNFCCC_reader/Peru/read_PER_BUR3_from_pdf.py

@@ -0,0 +1,290 @@
+# read Peru's third Biennial Update Report (BUR3) from pdf
+
+
+import camelot
+import primap2 as pm2
+import pandas as pd
+
+import locale
+
+from UNFCCC_GHG_data.helper import process_data_for_country, gas_baskets
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper import fix_rows
+from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
+from config_PER_BUR3 import table_def_templates, table_defs, index_cols
+from config_PER_BUR3 import values_replacement, header_long, cats_remove
+from config_PER_BUR3 import cat_codes_manual, cat_code_regexp, cat_names_fix
+from config_PER_BUR3 import coords_cols, coords_terminologies, coords_defaults
+from config_PER_BUR3 import coords_terminologies_2006
+from config_PER_BUR3 import coords_value_mapping, meta_data, filter_remove
+from config_PER_BUR3 import processing_info, cat_conversion
+
+### general configuration
+input_folder = downloaded_data_path / "UNFCCC" / "Peru" / "BUR3"
+output_folder = extracted_data_path / "UNFCCC" / "Peru"
+if not output_folder.exists():
+    output_folder.mkdir()
+
+output_filename = "PER_BUR3_2023_"
+inventory_file_pdf = "Tercer_BUR_Per%C3%BA_Jun2023.pdf"
+# years_to_read = range(1990, 2018 + 1)
+
+# define locale to use for str to float conversion
+locale_to_use = "es_PE.UTF-8"
+locale.setlocale(locale.LC_NUMERIC, locale_to_use)
+
+pagesToRead = table_defs.keys()
+
+compression = dict(zlib=True, complevel=9)
+
+## part 1: read the data from pdf
+### part 1.a: read the inventory tables (2000-2019)
+
+data_pm2 = None
+for page in pagesToRead:
+    print(f"++++++++++++++++++++++++++++++++")
+    print(f"+++++ Working on page {page} ++++++")
+    print(f"++++++++++++++++++++++++++++++++")
+
+    df_this_page = None
+    for table_on_page in table_defs[page]["templates"]:
+        print(f"Reading table {table_on_page}")
+        area = table_def_templates[table_on_page]["area"]
+        cols = table_def_templates[table_on_page]["cols"]
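+        # camelot's stream flavor parses whitespace-separated tables; the
+        # explicit table area and column positions come from the template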
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file_pdf),
+            pages=str(page),
+            flavor="stream",
+            table_areas=area,
+            columns=cols,
+        )
+
+        df_current = tables[0].df.copy(deep=True)
+        # drop the old header
+        if "drop_rows" in table_defs[page].keys():
+            df_current = df_current.drop(table_defs[page]["drop_rows"])
+        elif "drop_rows" in table_def_templates[table_on_page].keys():
+            df_current = df_current.drop(
+                table_def_templates[table_on_page]["drop_rows"]
+            )
+        # add new header
+        if "header" in table_defs[page].keys():
+            df_current.columns = pd.MultiIndex.from_tuples(
+                zip(
+                    table_defs[page]["header"]["entity"],
+                    table_defs[page]["header"]["unit"],
+                )
+            )
+        else:
+            df_current.columns = pd.MultiIndex.from_tuples(
+                zip(
+                    table_def_templates[table_on_page]["header"]["entity"],
+                    table_def_templates[table_on_page]["header"]["unit"],
+                )
+            )
+
+        # drop cols if necessary
+        if "drop_cols" in table_defs[page].keys():
+            # print(df_current.columns.values)
+            df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
+        elif "drop_cols" in table_def_templates[table_on_page].keys():
+            df_current = df_current.drop(
+                columns=table_def_templates[table_on_page]["drop_cols"]
+            )
+
+        # rename category column
+        df_current.rename(
+            columns={table_defs[page]["category_col"]: index_cols[0]}, inplace=True
+        )
+
+        # replace line breaks with spaces
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
+        # replace double and triple spaces
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")
+
+        # fix the split rows
+        for n_rows in table_def_templates[table_on_page]["rows_to_fix"].keys():
+            df_current = fix_rows(
+                df_current,
+                table_def_templates[table_on_page]["rows_to_fix"][n_rows],
+                index_cols[0],
+                n_rows,
+            )
+
+        # replace category names with typos
+        df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)
+
+        # replace empty strings
+        df_current = df_current.replace(values_replacement)
+
+        # set index
+        # df_current = df_current.set_index(index_cols)
+        # strip leading and trailing spaces and remove "^"
+        for col in df_current.columns.values:
+            df_current[col] = df_current[col].str.strip()
+            df_current[col] = df_current[col].str.replace("^", "")
+
+        # print(df_current)
+        # aggregate dfs for this page
+        if df_this_page is None:
+            df_this_page = df_current.copy(deep=True)
+        else:
+            # find intersecting cols
+            cols_this_page = df_this_page.columns.values
+            # print(f"cols this page: {cols_this_page}")
+            cols_current = df_current.columns.values
+            # print(f"cols current: {cols_current}")
+            cols_both = list(set(cols_this_page).intersection(set(cols_current)))
+            # print(f"cols both: {cols_both}")
+            if len(cols_both) > 0:
+                df_this_page = df_this_page.merge(
+                    df_current, how="outer", on=cols_both, suffixes=(None, None)
+                )
+            else:
+                df_this_page = df_this_page.merge(
+                    df_current,
+                    how="outer",
+                    left_index=True,
+                    right_index=True,
+                    suffixes=(None, None),
+                )
+
+            df_this_page = df_this_page.groupby(index_cols).first().reset_index()
+            # print(df_this_page)
+            # df_all = df_all.join(df_current, how='outer')
+
+    # set index and convert to long format
+    df_this_page = df_this_page.set_index(index_cols)
+    df_this_page_long = pm2.pm2io.nir_convert_df_to_long(
+        df_this_page, table_defs[page]["year"], header_long
+    )
+
+    # drop the rows with memo items etc
+    for cat in cats_remove:
+        df_this_page_long = df_this_page_long.drop(
+            df_this_page_long.loc[df_this_page_long.loc[:, index_cols[0]] == cat].index
+        )
+
+    # make a copy of the category names column
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, index_cols[0]]
+
+    # replace cat names by codes in the "category" column
+    # first the manual replacements
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, "category"].replace(
+        cat_codes_manual
+    )
+    # then the regex replacements
+    repl = lambda m: convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+        :, "category"
+    ].str.replace(cat_code_regexp, repl, regex=True)
+    df_this_page_long.loc[:, "category"].unique()
+
+    # strip spaces in data col
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.strip()
+
+    df_this_page_long = df_this_page_long.reset_index(drop=True)
+
+    # make sure all col headers are str
+    df_this_page_long.columns = df_this_page_long.columns.map(str)
+
+    # remove thousands separators as pd.to_numeric can't deal with that
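+    # the tables use "." as thousands separator and "," as decimal separator:
+    # e.g. "12.345,67" -> "12345,67" here, then -> "12345.67" below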
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+        ".", ""
+    )
+    pat = r"^(?P<first>[0-9\.,]*),(?P<last>[0-9\.,]*)$"
+    repl = lambda m: f"{m.group('first')}.{m.group('last')}"
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+        pat, repl, regex=True
+    )
+
+    # df_this_page_long["data"] = df_this_page_long["data"].str.replace("^.$","",
+    #                                                                   regex=True)
+
+    # drop orig cat name as it's not unique over all tables (keep until here in case
+    # it's needed for debugging)
+    df_this_page_long = df_this_page_long.drop(columns="orig_cat_name")
+
+    data_page_if = pm2.pm2io.convert_long_dataframe_if(
+        df_this_page_long,
+        coords_cols=coords_cols,
+        # add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping[
+            table_defs[page]["coords_value_mapping"]
+        ],
+        # coords_value_filling=coords_value_filling,
+        filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
+
+    # conversion to PRIMAP2 native format
+    data_page_pm2 = pm2.pm2io.from_interchange_format(data_page_if)
+
+    # combine with tables from other pages
+    if data_pm2 is None:
+        data_pm2 = data_page_pm2
+    else:
+        data_pm2 = data_pm2.pr.merge(data_page_pm2)
+
+# convert back to IF to have units in the fixed format
+data_if = data_pm2.pr.to_interchange_format()
+
+# ###
+# save data to IF and native format
+# ###
+if not output_folder.exists():
+    output_folder.mkdir()
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+    data_if,
+)
+
+encoding = {var: compression for var in data_pm2.data_vars}
+data_pm2.pr.to_netcdf(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+    encoding=encoding,
+)
+
+
+# ###
+# ## process the data
+# ###
+data_proc_pm2 = data_pm2
+
+# actual processing
+
+data_proc_pm2 = process_data_for_country(
+    data_proc_pm2,
+    entities_to_ignore=[],
+    gas_baskets=gas_baskets,
+    processing_info_country=processing_info,
+    cat_terminology_out=coords_terminologies_2006["category"],
+    category_conversion=cat_conversion,
+)
+
+# adapt source and metadata
+current_source = data_proc_pm2.coords["source"].values[0]
+data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
+data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
+
+# ###
+# save data to IF and native format
+# ###
+data_proc_if = data_proc_pm2.pr.to_interchange_format()
+if not output_folder.exists():
+    output_folder.mkdir()
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + coords_terminologies_2006["category"]),
+    data_proc_if,
+)
+
+encoding = {var: compression for var in data_proc_pm2.data_vars}
+data_proc_pm2.pr.to_netcdf(
+    output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+    encoding=encoding,
+)

+ 340 - 232
UNFCCC_GHG_data/helper/functions.py

@@ -15,76 +15,84 @@ from .definitions import root_path, downloaded_data_path, extracted_data_path
 from .definitions import legacy_data_path, code_path
 from .definitions import GWP_factors
 
+
 def process_data_for_country(
-        data_country: xr.Dataset,
-        entities_to_ignore: List[str],
-        gas_baskets: Dict[str, List[str]],
-        filter_dims: Optional[Dict[str, List[str]]] = None,
-        cat_terminology_out: Optional[str] = None,
-        category_conversion: Dict[str, Dict] = None,
-        sectors_out: List[str] = None,
-        processing_info_country: Dict = None,
+    data_country: xr.Dataset,
+    entities_to_ignore: List[str],
+    gas_baskets: Dict[str, List[str]],
+    filter_dims: Optional[Dict[str, List[str]]] = None,
+    cat_terminology_out: Optional[str] = None,
+    category_conversion: Dict[str, Dict] = None,
+    sectors_out: List[str] = None,
+    processing_info_country: Dict = None,
 ) -> xr.Dataset:
     """
-        Process data from DI interface (where necessary).
-        * Downscaling including subtraction of time series
-        * country specific sector aggregation
-        * Conversion to IPCC2006 categories
-        * general sector and gas basket aggregation (in new categories)
+    Process data from DI interface (where necessary).
+    * Downscaling including subtraction of time series
+    * country specific sector aggregation
+    * Conversion to IPCC2006 categories
+    * general sector and gas basket aggregation (in new categories)
     """
 
     # 0: gather information
-    countries = list(data_country.coords[data_country.attrs['area']].values)
+    countries = list(data_country.coords[data_country.attrs["area"]].values)
     if len(countries) > 1:
         raise ValueError(
             f"Found {len(countries)} countries. Only single country data "
-            f"can be processed by this function. countries: {countries}")
+            f"can be processed by this function. countries: {countries}"
+        )
     else:
         country_code = countries[0]
 
     # get category terminology
-    cat_col = data_country.attrs['cat']
-    temp = re.findall(r'\((.*)\)', cat_col)
+    cat_col = data_country.attrs["cat"]
+    temp = re.findall(r"\((.*)\)", cat_col)
     cat_terminology_in = temp[0]
 
     # get scenario
-    scenarios = list(data_country.coords[data_country.attrs['scen']].values)
+    scenarios = list(data_country.coords[data_country.attrs["scen"]].values)
     if len(scenarios) > 1:
         raise ValueError(
             f"Found {len(scenarios)} scenarios. Only single scenario data "
-            f"can be processed by this function. Scenarios: {scenarios}")
+            f"can be processed by this function. Scenarios: {scenarios}"
+        )
     scenario = scenarios[0]
 
     # get source
-    sources = list(data_country.coords['source'].values)
+    sources = list(data_country.coords["source"].values)
     if len(sources) > 1:
         raise ValueError(
             f"Found {len(sources)} sources. Only single source data "
-            f"can be processed by this function. Sources: {sources}")
+            f"can be processed by this function. Sources: {sources}"
+        )
     source = sources[0]
 
     # check if category name column present
     # TODO: replace 'name' in config by  'additional_cols' dict that defines the cols
     #  and the values
-    if 'orig_cat_name' in data_country.coords:
+    if "orig_cat_name" in data_country.coords:
         cat_name_present = True
     else:
         cat_name_present = False
 
     # 1: general processing
     # remove unused cats
-    data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
+    data_country = data_country.dropna(f"category ({cat_terminology_in})", how="all")
     # remove unused years
-    data_country = data_country.dropna(f'time', how='all')
+    data_country = data_country.dropna(f"time", how="all")
     # remove variables only containing nan
-    nan_vars_country = [var for var in data_country.data_vars if
-                        bool(data_country[var].isnull().all().data) is True]
+    nan_vars_country = [
+        var
+        for var in data_country.data_vars
+        if bool(data_country[var].isnull().all().data) is True
+    ]
     print(f"removing all-nan variables: {nan_vars_country}")
     data_country = data_country.drop_vars(nan_vars_country)
 
     # remove unnecessary variables
-    entities_ignore_present = [entity for entity in entities_to_ignore if
-                               entity in data_country.data_vars]
+    entities_ignore_present = [
+        entity for entity in entities_to_ignore if entity in data_country.data_vars
+    ]
     data_country = data_country.drop_vars(entities_ignore_present)
 
     # filter ()
@@ -93,167 +101,200 @@ def process_data_for_country(
 
     # 2: country specific processing
     if processing_info_country is not None:
-
-        if 'tolerance' in processing_info_country:
+        if "tolerance" in processing_info_country:
             tolerance = processing_info_country["tolerance"]
         else:
             tolerance = 0.01
 
         # remove entities if needed
-        if 'ignore_entities' in processing_info_country:
-            entities_to_ignore_country = processing_info_country[
-                'ignore_entities']
-            entities_ignore_present = \
-                [entity for entity in entities_to_ignore_country if
-                 entity in data_country.data_vars]
+        if "ignore_entities" in processing_info_country:
+            entities_to_ignore_country = processing_info_country["ignore_entities"]
+            entities_ignore_present = [
+                entity
+                for entity in entities_to_ignore_country
+                if entity in data_country.data_vars
+            ]
             data_country = data_country.drop_vars(entities_ignore_present)
 
         # take only desired years
-        if 'years' in processing_info_country:
+        if "years" in processing_info_country:
             data_country = data_country.pr.loc[
-                {'time': processing_info_country['years']}]
+                {"time": processing_info_country["years"]}
+            ]
 
         # remove timeseries if desired
-        if 'remove_ts' in processing_info_country:
-            for case in processing_info_country['remove_ts']:
-                remove_info = copy.deepcopy(processing_info_country['remove_ts'][case])
+        if "remove_ts" in processing_info_country:
+            for case in processing_info_country["remove_ts"]:
+                remove_info = copy.deepcopy(processing_info_country["remove_ts"][case])
                 entities = remove_info.pop("entities")
                 for entity in entities:
-                    data_country[entity].pr.loc[remove_info] = \
+                    data_country[entity].pr.loc[remove_info] = (
                         data_country[entity].pr.loc[remove_info] * np.nan
+                    )
 
         # remove all data for given years if necessary
-        if 'remove_years' in processing_info_country:
+        if "remove_years" in processing_info_country:
             data_country = data_country.drop_sel(
-                time=processing_info_country['remove_years'])
+                time=processing_info_country["remove_years"]
+            )
 
         # subtract categories
-        if 'subtract_cats' in processing_info_country:
-            subtract_cats_current = processing_info_country['subtract_cats']
+        if "subtract_cats" in processing_info_country:
+            subtract_cats_current = processing_info_country["subtract_cats"]
             print(f"Subtracting categories for country {country_code}")
             for cat_to_generate in subtract_cats_current:
-                if 'entities' in subtract_cats_current[cat_to_generate].keys():
-                    entities_current = subtract_cats_current[cat_to_generate]['entities']
+                if "entities" in subtract_cats_current[cat_to_generate].keys():
+                    entities_current = subtract_cats_current[cat_to_generate][
+                        "entities"
+                    ]
                 else:
                     entities_current = list(data_country.data_vars)
 
-                cats_to_subtract = \
-                    subtract_cats_current[cat_to_generate]['subtract']
-                data_sub = \
-                    data_country[entities_current].pr.loc[
-                        {'category': cats_to_subtract}].pr.sum(
-                        dim='category', skipna=True, min_count=1)
+                cats_to_subtract = subtract_cats_current[cat_to_generate]["subtract"]
+                data_sub = (
+                    data_country[entities_current]
+                    .pr.loc[{"category": cats_to_subtract}]
+                    .pr.sum(dim="category", skipna=True, min_count=1)
+                )
                 data_parent = data_country[entities_current].pr.loc[
-                    {'category': subtract_cats_current[cat_to_generate]['parent']}]
+                    {"category": subtract_cats_current[cat_to_generate]["parent"]}
+                ]
                 data_agg = data_parent - data_sub
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if bool(data_agg[var].isnull().all().data)
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
                     print(f"Generating {cat_to_generate} through subtraction")
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
 
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_generate])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_generate],
+                            )
+                        }
+                    )
                     if cat_name_present:
-                        cat_name = subtract_cats_current[cat_to_generate]['name']
+                        cat_name = subtract_cats_current[cat_to_generate]["name"]
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(data_agg, tolerance=tolerance)
                 else:
                     print(f"no data to generate category {cat_to_generate}")
 
         # downscaling
-        if 'downscale' in processing_info_country:
-            if 'sectors' in processing_info_country['downscale']:
-                sector_downscaling = \
-                    processing_info_country['downscale']['sectors']
+        if "downscale" in processing_info_country:
+            if "sectors" in processing_info_country["downscale"]:
+                sector_downscaling = processing_info_country["downscale"]["sectors"]
                 for case in sector_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     sector_downscaling_current = sector_downscaling[case]
-                    entities = sector_downscaling_current.pop('entities')
+                    entities = sector_downscaling_current.pop("entities")
                     for entity in entities:
                         data_country[entity] = data_country[
-                            entity].pr.downscale_timeseries(
-                            **sector_downscaling_current)
+                            entity
+                        ].pr.downscale_timeseries(**sector_downscaling_current)
                         # , skipna_evaluation_dims=None)
 
-            if 'entities' in processing_info_country['downscale']:
-                entity_downscaling = \
-                    processing_info_country['downscale']['entities']
+            if "entities" in processing_info_country["downscale"]:
+                entity_downscaling = processing_info_country["downscale"]["entities"]
                 for case in entity_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     # print(data_country.coords[f'category ('
                     #                          f'{cat_terminology_in})'].values)
                     data_country = data_country.pr.downscale_gas_timeseries(
-                        **entity_downscaling[case], skipna=True,
-                        skipna_evaluation_dims=None)
+                        **entity_downscaling[case],
+                        skipna=True,
+                        skipna_evaluation_dims=None,
+                    )
 
         # aggregate categories
-        if 'aggregate_cats' in processing_info_country:
-            if 'agg_tolerance' in processing_info_country:
-                agg_tolerance = processing_info_country['agg_tolerance']
+        if "aggregate_cats" in processing_info_country:
+            if "agg_tolerance" in processing_info_country:
+                agg_tolerance = processing_info_country["agg_tolerance"]
             else:
                 agg_tolerance = tolerance
-            aggregate_cats_current = processing_info_country['aggregate_cats']
+            aggregate_cats_current = processing_info_country["aggregate_cats"]
             print(
                 f"Aggregating categories for country {country_code}, source {source}, "
-                f"scenario {scenario}")
+                f"scenario {scenario}"
+            )
             for cat_to_agg in aggregate_cats_current:
                 print(f"Category: {cat_to_agg}")
-                source_cats = aggregate_cats_current[cat_to_agg]['sources']
-                data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
-                    dim='category', skipna=True, min_count=1)
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                source_cats = aggregate_cats_current[cat_to_agg]["sources"]
+                data_agg = data_country.pr.loc[{"category": source_cats}].pr.sum(
+                    dim="category", skipna=True, min_count=1
+                )
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if bool(data_agg[var].isnull().all().data)
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_agg])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_agg],
+                            )
+                        }
+                    )
                     if cat_name_present:
-                        cat_name = aggregate_cats_current[cat_to_agg]['name']
+                        cat_name = aggregate_cats_current[cat_to_agg]["name"]
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=agg_tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(
+                        data_agg, tolerance=agg_tolerance
+                    )
                 else:
                     print(f"no data to aggregate category {cat_to_agg}")
 
         # copy HFCs and PFCs with default factors
-        if 'basket_copy' in processing_info_country:
+        if "basket_copy" in processing_info_country:
             GWPs_to_add = processing_info_country["basket_copy"]["GWPs_to_add"]
             entities = processing_info_country["basket_copy"]["entities"]
             source_GWP = processing_info_country["basket_copy"]["source_GWP"]
             for entity in entities:
-                data_source = data_country[f'{entity} ({source_GWP})']
+                data_source = data_country[f"{entity} ({source_GWP})"]
                 for GWP in GWPs_to_add:
-                    data_GWP = data_source * \
-                               GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    data_GWP = (
+                        data_source * GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    )
                     data_GWP.attrs["entity"] = entity
                     data_GWP.attrs["gwp_context"] = GWP
                     data_country[f"{entity} ({GWP})"] = data_GWP
 
         # aggregate gases if desired
-        if 'aggregate_gases' in processing_info_country:
+        if "aggregate_gases" in processing_info_country:
             # TODO: why use different code here than below. Can this fill non-existent
             #  gas baskets?
-            for case in processing_info_country['aggregate_gases'].keys():
-                case_info = processing_info_country['aggregate_gases'][case]
-                data_country[case_info['basket']] = \
-                    data_country.pr.fill_na_gas_basket_from_contents(
-                        **case_info)
+            for case in processing_info_country["aggregate_gases"].keys():
+                case_info = processing_info_country["aggregate_gases"][case]
+                data_country[
+                    case_info["basket"]
+                ] = data_country.pr.fill_na_gas_basket_from_contents(**case_info)
 
     # 3: map categories
     if category_conversion is not None:
@@ -270,61 +311,74 @@ def process_data_for_country(
     # more general processing
     # reduce categories to output cats
     if sectors_out is not None:
-        cats_to_keep = [cat for cat in
-                        data_country.coords[f'category ({cat_terminology_out})'].values
-                        if cat in sectors_out]
-        data_country = data_country.pr.loc[{'category': cats_to_keep}]
+        cats_to_keep = [
+            cat
+            for cat in data_country.coords[f"category ({cat_terminology_out})"].values
+            if cat in sectors_out
+        ]
+        data_country = data_country.pr.loc[{"category": cats_to_keep}]
 
     # create gas baskets
     entities_present = set(data_country.data_vars)
     for basket in gas_baskets.keys():
-        basket_contents_present = [gas for gas in gas_baskets[basket] if
-                                   gas in entities_present]
+        basket_contents_present = [
+            gas for gas in gas_baskets[basket] if gas in entities_present
+        ]
         if len(basket_contents_present) > 0:
             if basket in list(data_country.data_vars):
                 data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
-                    basket=basket, basket_contents=basket_contents_present,
-                    skipna=True, min_count=1)
+                    basket=basket,
+                    basket_contents=basket_contents_present,
+                    skipna=True,
+                    min_count=1,
+                )
             else:
                 try:
-                    #print(data_country.data_vars)
-                    data_country[basket] = xr.full_like(data_country["CO2"],
-                                                        np.nan).pr.quantify(
-                        units="Gg CO2 / year")
-                    data_country[basket].attrs = {"entity": basket.split(' ')[0],
-                                                  "gwp_context": basket.split(' ')[1][
-                                                                 1:-1]}
+                    # print(data_country.data_vars)
+                    data_country[basket] = xr.full_like(
+                        data_country["CO2"], np.nan
+                    ).pr.quantify(units="Gg CO2 / year")
+                    data_country[basket].attrs = {
+                        "entity": basket.split(" ")[0],
+                        "gwp_context": basket.split(" ")[1][1:-1],
+                    }
                     data_country[basket] = data_country.pr.gas_basket_contents_sum(
-                        basket=basket, basket_contents=basket_contents_present,
-                        min_count=1)
+                        basket=basket,
+                        basket_contents=basket_contents_present,
+                        min_count=1,
+                    )
                     entities_present.add(basket)
                 except Exception as ex:
-                    print(f"No gas basket created for {country_code}, {source}, "
-                          f"{scenario}: {ex}")
+                    print(
+                        f"No gas basket created for {country_code}, {source}, "
+                        f"{scenario}: {ex}"
+                    )
 
     # amend title and comment
-    data_country.attrs["comment"] = data_country.attrs["comment"] + f" Processed on " \
-                                                                    f"{date.today()}"
-    data_country.attrs["title"] = data_country.attrs["title"] + f" Processed on " \
-                                                                    f"{date.today()}"
+    data_country.attrs["comment"] = (
+        data_country.attrs["comment"] + f" Processed on " f"{date.today()}"
+    )
+    data_country.attrs["title"] = (
+        data_country.attrs["title"] + f" Processed on " f"{date.today()}"
+    )
 
     return data_country
 
 
 def convert_categories(
-        ds_input: xr.Dataset,
-        conversion: Dict[str, Dict[str, str]],
-        #terminology_from: str,
-        terminology_to: str,
-        debug: bool=False,
-        tolerance: float=0.01,
-)->xr.Dataset:
+    ds_input: xr.Dataset,
+    conversion: Dict[str, Dict[str, str]],
+    # terminology_from: str,
+    terminology_to: str,
+    debug: bool = False,
+    tolerance: float = 0.01,
+) -> xr.Dataset:
     """
     convert data from one category terminology to another
     """
     print(f"converting categories to {terminology_to}")
 
-    if 'orig_cat_name' in ds_input.coords:
+    if "orig_cat_name" in ds_input.coords:
         cat_name_present = True
     else:
         cat_name_present = False
@@ -338,50 +392,67 @@ def convert_categories(
     ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})
 
     # find categories present in dataset
-    cats_present = list(ds_converted.coords[f'category ({terminology_to})'])
+    cats_present = list(ds_converted.coords[f"category ({terminology_to})"])
 
     # restrict categories and map category names
-    if 'mapping' in conversion.keys():
-        mapping_cats_present = [cat for cat in list(conversion['mapping'].keys()) if
-                                cat in cats_present]
-        ds_converted = ds_converted.pr.loc[
-            {'category': mapping_cats_present}]
-
-        from_cats = ds_converted.coords[f'category ({terminology_to})'].values
-        to_cats = pd.Series(from_cats).replace(conversion['mapping'])
-        ds_converted = ds_converted.assign_coords({f'category ({terminology_to})':
-                                                   (f'category ({terminology_to})',
-                                                    to_cats)})
+    if "mapping" in conversion.keys():
+        mapping_cats_present = [
+            cat for cat in list(conversion["mapping"].keys()) if cat in cats_present
+        ]
+        ds_converted = ds_converted.pr.loc[{"category": mapping_cats_present}]
+
+        from_cats = ds_converted.coords[f"category ({terminology_to})"].values
+        to_cats = pd.Series(from_cats).replace(conversion["mapping"])
+        ds_converted = ds_converted.assign_coords(
+            {f"category ({terminology_to})": (f"category ({terminology_to})", to_cats)}
+        )
 
     # redo the list of present cats after mapping, as we have new categories in the
     # target terminology now
-    cats_present_mapped = list(ds_converted.coords[f'category ('
-                                                   f'{terminology_to})'].values)
+    cats_present_mapped = list(
+        ds_converted.coords[f"category ({terminology_to})"].values
+    )
     # aggregate categories
-    if 'aggregate' in conversion:
-        aggregate_cats = conversion['aggregate']
+    if "aggregate" in conversion:
+        aggregate_cats = conversion["aggregate"]
         for cat_to_agg in aggregate_cats:
             if debug:
                 print(f"Category: {cat_to_agg}")
-            source_cats = [cat for cat in aggregate_cats[cat_to_agg]['sources'] if
-                           cat in cats_present_mapped]
+            source_cats = [
+                cat
+                for cat in aggregate_cats[cat_to_agg]["sources"]
+                if cat in cats_present_mapped
+            ]
             if debug:
                 print(source_cats)
-            data_agg = ds_converted.pr.loc[{'category': source_cats}].pr.sum(
-                dim='category', skipna=True, min_count=1)
-            nan_vars = [var for var in data_agg.data_vars if
-                        data_agg[var].isnull().all().data == True]
+            data_agg = ds_converted.pr.loc[{"category": source_cats}].pr.sum(
+                dim="category", skipna=True, min_count=1
+            )
+            nan_vars = [
+                var for var in data_agg.data_vars if data_agg[var].isnull().all().data
+            ]
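+            # drop all-NaN entities so they are not merged into the dataset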
             data_agg = data_agg.drop(nan_vars)
             if len(data_agg.data_vars) > 0:
-                data_agg = data_agg.expand_dims([f'category ({terminology_to})'])
+                data_agg = data_agg.expand_dims([f"category ({terminology_to})"])
                 data_agg = data_agg.assign_coords(
-                    coords={f'category ({terminology_to})':
-                                (f'category ({terminology_to})', [cat_to_agg])})
+                    coords={
+                        f"category ({terminology_to})": (
+                            f"category ({terminology_to})",
+                            [cat_to_agg],
+                        )
+                    }
+                )
                 if cat_name_present:
                     data_agg = data_agg.assign_coords(
-                        coords={'orig_cat_name':
-                                    (f'category ({terminology_to})',
-                                     [aggregate_cats[cat_to_agg]['name']])})
+                        coords={
+                            "orig_cat_name": (
+                                f"category ({terminology_to})",
+                                [aggregate_cats[cat_to_agg]["name"]],
+                            )
+                        }
+                    )
                 ds_converted = ds_converted.pr.merge(data_agg, tolerance=tolerance)
                 cats_present_mapped.append(cat_to_agg)
             else:
@@ -391,9 +462,9 @@ def convert_categories(
 
 
 def get_country_name(
-        country_code: str,
+    country_code: str,
 ) -> str:
-    """get country name from code """
+    """get country name from code"""
     if country_code in custom_country_mapping:
         country_name = custom_country_mapping[country_code]
     else:
@@ -401,15 +472,16 @@ def get_country_name(
             country = pycountry.countries.get(alpha_3=country_code)
             country_name = country.name
         except:
-            raise ValueError(f"Country code {country_code} can not be mapped to "
-                             f"any country")
+            raise ValueError(
+                f"Country code {country_code} cannot be mapped to any country"
+            )
 
     return country_name
 
 
 def get_country_code(
-        country_name: str,
-)->str:
+    country_name: str,
+) -> str:
     """
     obtain country code. If the input is a code it will be returned,
     if the input
@@ -435,28 +507,31 @@ def get_country_code(
             country_code = country.alpha_3
         except:
             try:
-                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
+                country = pycountry.countries.search_fuzzy(
+                    country_name.replace("_", " ")
+                )
             except:
-                raise ValueError(f"Country name {country_name} can not be mapped to "
-                                 f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
+                raise ValueError(
+                    f"Country name {country_name} cannot be mapped to "
+                    f"any country code. Try using the ISO3 code directly."
+                )
             if len(country) > 1:
                 country_code = None
                 for current_country in country:
                     if current_country.name == country_name:
                         country_code = current_country.alpha_3
                 if country_code is None:
-                    raise ValueError(f"Country name {country_name} has {len(country)} "
-                                     f"possible results for country codes.")
+                    raise ValueError(
+                        f"Country name {country_name} has {len(country)} "
+                        f"possible results for country codes."
+                    )
 
-            country_code = country[0].alpha_3
+            else:
+                country_code = country[0].alpha_3
 
     return country_code
 
 
-def create_folder_mapping(
-        folder: str,
-        extracted: bool = False
-) -> None:
+def create_folder_mapping(folder: str, extracted: bool = False) -> None:
     """
     Create a mapping from 3 letter ISO country codes to folders
     based on the subfolders of the given folder. The mapping is
@@ -480,9 +555,9 @@ def create_folder_mapping(
 
     folder = root_path / folder
     folder_mapping = {}
-    #if not extracted:
+    # if not extracted:
     known_folders = custom_folders
-    #else:
+    # else:
     #    known_folders = {}
 
     for item in folder.iterdir():
@@ -491,7 +566,9 @@ def create_folder_mapping(
                 ISO3 = known_folders[item.name]
             else:
                 try:
-                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
+                    country = pycountry.countries.search_fuzzy(
+                        item.name.replace("_", " ")
+                    )
                     if len(country) > 1:
                         ISO3 = None
                         for current_country in country:
@@ -516,8 +593,8 @@ def create_folder_mapping(
 
 # TODO add crf
 def get_country_submissions(
-        country_name: str,
-        print_sub: bool = True,
+    country_name: str,
+    print_sub: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO code for a country, or the country's name.
@@ -585,8 +662,8 @@ def get_country_submissions(
 
 
 def get_country_datasets(
-        country_name: str,
-        print_ds: bool = True,
+    country_name: str,
+    print_ds: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO code for a country, or the country's name.
@@ -638,35 +715,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
 
                 datasets_current_folder = {}
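+                # map each dataset file stem to the list of suffixes present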
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] =\
-                            dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong code: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology})"
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
 
                         code_file = get_code_file(country_code, parts[1])
@@ -680,7 +764,9 @@ def get_country_datasets(
                 if print_ds:
                     if cleaned_datasets_current_folder:
                         for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                            print(
+                                f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                            )
                     else:
                         print("No data available")
                     print("")
@@ -708,34 +794,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
 
                 datasets_current_folder = {}
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong code: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
 
                         cleaned_datasets_current_folder[key] = data_info
@@ -743,7 +837,9 @@ def get_country_datasets(
                 if print_ds:
                     if cleaned_datasets_current_folder:
                         for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                            print(
+                                f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                            )
                     else:
                         print("No data available")
                     print("")
@@ -759,9 +855,9 @@ def get_country_datasets(
 
 
 def get_code_file(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
+    country_name: str,
+    submission: str,
+    print_info: bool = False,
 ) -> Path:
     """
     For a given country name and submission, find the script that creates the data
@@ -813,13 +909,17 @@ def get_code_file(
         for file in country_folder.iterdir():
             if file.match(code_file_name_candidate):
                 if code_file_path is not None:
-                    raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
-                                     f"{code_file_path} and file.name. "
-                                     f"Please use only one file with name "
-                                     f"'read_ISO3_submission_XXX.YYY'.")
+                    raise ValueError(
+                        f"Found multiple code file candidates: "
+                        f"{code_file_path} and {file.name}. "
+                        f"Please use only one file with name "
+                        f"'read_ISO3_submission_XXX.YYY'."
+                    )
                 else:
                     if print_info:
-                        print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
+                        print(f"Found code file {file.relative_to(root_path)}")
                 code_file_path = file
 
     if code_file_path is not None:
@@ -828,8 +928,10 @@ def get_code_file(
         return None
 
 
-def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int)->pd.DataFrame:
-    '''
+def fix_rows(
+    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
+) -> pd.DataFrame:
+    """
     Function to fix rows that have been split during reading from a pdf.
     This is the version used for Malaysia BUR3 and BUR4; adapt for other
     BURs if needed.
 
@@ -838,18 +940,20 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
     :param col_to_use: column in which to look for the rows_to_fix values
     :param n_rows: number of rows to merge; negative values mean the merge
         starts one row above the matched row
     :return: DataFrame with the split rows merged
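+
+    Example: n_rows=2 joins the matched row with the row below it;
+    n_rows=-3 joins the row above, the matched row, and the row below.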
-    '''
+    """
     for row in rows_to_fix:
-        #print(row)
+        # print(row)
         # find the row number and collect the rows to merge
         index = data.loc[data[col_to_use] == row].index
-        #print(list(index))
+        # print(list(index))
         if not list(index):
             print(f"Can't merge split row {row}")
             print(data[col_to_use])
+            continue
-        #print(f"Merging split row {row} for table {page}")
+        # print(f"Merging split row {row} for table {page}")
         loc = data.index.get_loc(index[0])
-        if n_rows == -3:
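+        # negative n_rows: the merge starts one row above the matched row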
+        if n_rows == -2:
+            locs_to_merge = list(range(loc - 1, loc + 1))
+        elif n_rows == -3:
             locs_to_merge = list(range(loc - 1, loc + 2))
         elif n_rows == -5:
             locs_to_merge = list(range(loc - 1, loc + 4))
@@ -858,7 +962,7 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         rows_to_merge = data.iloc[locs_to_merge]
         indices_to_merge = rows_to_merge.index
         # join the three rows
-        new_row = rows_to_merge.agg(' '.join)
+        new_row = rows_to_merge.agg(" ".join)
         # replace the double spaces that are created
         # must be done here and not at the end as splits are not always
         # the same and join would produce different col values
@@ -866,6 +970,10 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         new_row = new_row.str.replace("N O", "NO")
         new_row = new_row.str.replace(", N", ",N")
         new_row = new_row.str.replace("- ", "-")
+        # replace spaces in numbers
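+        # e.g. "123 456,78" -> "123456,78" (one space between digit groups)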
+        pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
+        repl = lambda m: f"{m.group('first')}{m.group('last')}"
+        new_row = new_row.str.replace(pat, repl, regex=True)
         data.loc[indices_to_merge[0]] = new_row
         data = data.drop(indices_to_merge[1:])
-    return data
+    return data

+ 3 - 0
pyproject.toml

@@ -6,3 +6,6 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
 
+[tool.black]
+line-length = 88
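+# 88 is black's default line length; set explicitly so editors and CI agree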
+

+ 2 - 1
setup.cfg

@@ -30,7 +30,7 @@ packages =
     UNFCCC_GHG_data.UNFCCC_downloader
     UNFCCC_GHG_data.UNFCCC_DI_reader
     UNFCCC_GHG_data.helper
-    #UNFCCC_GHG_data.datasets
+#UNFCCC_GHG_data.datasets
 python_requires = >=3.8
 setup_requires =
     setuptools_scm
@@ -70,6 +70,7 @@ dev =
     jupyter
     dask
     ipympl
+    black
 
 
 [options.package_data]