Add Peru BUR3 code and some modifications to functions

Johannes Gütschow committed 1 year ago
commit 61639b9f1e

+ 560 - 0
UNFCCC_GHG_data/UNFCCC_reader/Peru/config_PER_BUR3.py

@@ -0,0 +1,560 @@
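+# Configuration for reading the GHG inventory tables from Peru's third
+# Biennial Update Report (BUR3) PDF with camelot. The templates below give,
+# for each PDF page, the table area, the column separators, and the rows
+# that need re-joining after extraction.
+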
+table_def_templates = {
+    "300": {  # 300
+        "area": ["69,457,727,78"],
+        "cols": ["288,352,391,426,458,485,519,552,587,615,643"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de uso",
+                "Uso de productos sustitutos de las sustancias que",
+            ],
+            2: [
+                "1A Actividades de quema de combustible",
+                "2A Industria de los minerales",
+                "2B Industria química",
+                "2C Industria de los metales",
+                "2E Industria electrónica",
+                "3A Ganado",
+                "3A1 Fermentación entérica",
+            ],
+        },
+    },
+    "301": {  # 301
+        "area": ["72,542,727,99"],
+        "cols": ["288,352,391,426,458,485,519,552,587,615,643"],
+        "rows_to_fix": {
+            3: [
+                "Fuentes agregadas y fuentes de emisión no CO2 de",
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            2: [
+                "3A2 Manejo del estiércol",
+                "3C1 Emisiones por quema de biomasa",
+                "3C3 Aplicación de urea",
+                "3C7 Cultivo de arroz",
+                "A Disposición de residuos sólidos",
+                "B Tratamiento biológico de residuos",
+                "C Incineración de residuos",
+                "D Tratamiento y descarga de aguas residuales",
+                "Búnker internacional",
+            ],
+        },
+    },
+    "302": {  # 302
+        "area": ["72,510,727,79"],
+        "cols": ["278,335,376,415,453,482,512,548,585,623,656"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emision no CO2",
+            ],
+            -3: ["Total de las emisiones y remociones nacionales"],
+        },
+    },
+    "303": {  # 303
+        "area": ["72,540,727,127"],
+        "cols": ["278,335,376,415,453,482,512,548,585,623,656"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            2: ["Aviación internacional"],
+        },
+    },
+    "304": {  # 304
+        "area": ["72,510,727,70"],
+        "cols": ["275,332,365,408,441,470,499,533,577,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "305": {  # 305
+        "area": ["72,540,727,108"],
+        "cols": ["275,332,365,408,441,470,499,533,577,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+    "306": {  # 306
+        "area": ["72,510,727,70"],
+        "cols": ["266,320,364,405,440,468,499,536,576,620,656"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la",
+                "Productos no energéticos de combustibles y",
+                "Uso de productos sustitutos de las sustancias",
+                "Fuentes agregadas y fuentes de emisión no",
+            ],
+        },
+    },
+    "307": {  # 307
+        "area": ["72,540,727,108"],
+        "cols": ["266,320,364,405,440,468,499,536,576,620,656"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA",
+            ],
+        },
+    },
+    "308": {  # 308
+        "area": ["72,510,727,70"],
+        "cols": ["278,329,372,406,441,470,500,536,579,621,653"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "309": {  # 309
+        "area": ["72,540,727,117"],
+        "cols": ["278,329,372,406,441,470,500,536,579,621,653"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+    "310": {  # 310
+        "area": ["72,510,727,70"],
+        "cols": ["279,334,379,418,453,480,505,541,582,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "311": {  # 311
+        "area": ["72,540,727,110"],
+        "cols": ["279,334,379,418,453,480,505,541,582,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            -2: ["Emisiones de CO2 de la biomasa"],
+        },
+    },
+    "312": {  # 312
+        "area": ["72,510,727,70"],
+        "cols": ["297,349,393,426,461,489,514,547,592,629,657"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones fugitivas provenientes de la fabricación de",
+                "Productos no energéticos de combustibles y de uso de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2 de la",
+            ],
+        },
+    },
+    "313": {  # 313
+        "area": ["72,540,727,90"],
+        "cols": ["297,349,393,426,461,489,514,547,592,629,657"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+}
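+
+# Template fields (as used by the reader script):
+# - "area": bounding box of the table on the page ("x1,y1,x2,y2" in PDF
+#   points), passed to camelot.read_pdf as table_areas
+# - "cols": x-coordinates of the column separators, passed as columns
+# - "rows_to_fix": category names that camelot split over several rows; the
+#   key is the number of rows fix_rows joins (a negative key presumably
+#   joins in the opposite direction, with the matched row last)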
+
+header = {
+    "entity": [
+        "Categorías de emisiones y sumideros de GEI",
+        "Emisiones/remociones netas de CO2",
+        "CH4",
+        "N2O",
+        "HFC",
+        "PFC",
+        "SF6",
+        "CO",
+        "NOx",
+        "COVDM",
+        "SOX",
+        "Emisiones/remociones totales de GEI",
+    ],
+    "unit": [
+        "",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+        "GgCO2eq",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+    ],
+}
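+
+# The reader zips the two lists above into an (entity, unit) column
+# MultiIndex; the F-gas baskets and the GHG total are reported in Gg CO2eq,
+# the individual gases in Gg.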
+
+table_defs = {
+    "300": {
+        "templates": ["300"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4, 5],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2000,
+        "coords_value_mapping": "default",
+    },
+    "301": {
+        "templates": ["301"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4, 5],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2000,
+        "coords_value_mapping": "default",
+    },
+    "302": {
+        "templates": ["302"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2005,
+        "coords_value_mapping": "default",
+    },
+    "303": {
+        "templates": ["303"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2005,
+        "coords_value_mapping": "default",
+    },
+    "304": {
+        "templates": ["304"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2010,
+        "coords_value_mapping": "default",
+    },
+    "305": {
+        "templates": ["305"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2010,
+        "coords_value_mapping": "default",
+    },
+    "306": {
+        "templates": ["306"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2012,
+        "coords_value_mapping": "default",
+    },
+    "307": {
+        "templates": ["307"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2012,
+        "coords_value_mapping": "default",
+    },
+    "308": {
+        "templates": ["308"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2014,
+        "coords_value_mapping": "default",
+    },
+    "309": {
+        "templates": ["309"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2014,
+        "coords_value_mapping": "default",
+    },
+    "310": {
+        "templates": ["310"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2016,
+        "coords_value_mapping": "default",
+    },
+    "311": {
+        "templates": ["311"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2016,
+        "coords_value_mapping": "default",
+    },
+    "312": {
+        "templates": ["312"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2019,
+        "coords_value_mapping": "default",
+    },
+    "313": {
+        "templates": ["313"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2019,
+        "coords_value_mapping": "default",
+    },
+}
+
+cat_names_fix = {
+    "Industrias manufactureras y de la 1A2 construcción":
+        "1A2 Industrias manufactureras y de la construcción",
+    "Emisiones fugitivas provenientes de la fabricación 1B de combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Emisiones fugitivas provenientes de la 1B fabricación de combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Emisiones fugitivas provenientes de la fabricación de 1B combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Productos no energéticos de combustibles y de uso 2D de solventes":
+        "2D Productos no energéticos de combustibles y de uso de solventes",
+    "Productos no energéticos de combustibles y de 2D uso de solventes":
+        "2D Productos no energéticos de combustibles y de uso de solventes",
+    "Uso de productos sustitutos de las sustancias que 2F agotan la capa de ozono":
+        "2F Uso de productos sustitutos de las sustancias que agotan la capa de ozono",
+    "Uso de productos sustitutos de las sustancias 2F que agotan la capa de ozono":
+        "2F Uso de productos sustitutos de las sustancias que agotan la capa de ozono",
+    "Fuentes agregadas y fuentes de emisión no CO2 de 3C la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emision no CO2 3C de la tierra":
+        "3C Fuentes agregadas y fuentes de emision no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no CO2 3C de la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no 3C CO2 de la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no CO2 de la 3C tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Emisiones directas de N2O en suelos 3C4 gestionados":
+        "3C4 Emisiones directas de N2O en suelos gestionados",
+    "Emisiones indirectas de N2O en suelos 3C5 gestionados":
+        "3C5 Emisiones indirectas de N2O en suelos gestionados",
+    "Emisiones indirectas de N2O por manejo del 3C6 estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "Emisiones indirectas de N2O por manejo 3C6 del estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "Emisiones indirectas de N2O por 3C6 manejo del estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y 4 SILVICULTURA":
+        "4 USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y SILVICULTURA",
+    "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA 4 Y SILVICULTURA":
+        "4 USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y SILVICULTURA",
+}
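+
+# For wrapped category names camelot merges the code into the middle of the
+# name (e.g. "Industrias manufactureras y de la 1A2 construcción"). The
+# mapping above restores the "code first" form so that cat_code_regexp
+# below can extract the code.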
+
+values_replacement = {
+    #    '': '-',
+    " ": "",
+}
+
+gwp_to_use = "AR5GWP100"
+
+index_cols = ["orig_cat_name"]
+cols_for_space_stripping = index_cols
+
+unit_row = "header"
+
+## parameters part 2: conversion to PRIMAP2 interchange format
+
+cats_remove = ["Partidas informativas"]
+
+cat_codes_manual = {
+    "Emisiones de CO2 de la biomasa": "M.BIO",
+    "Total de las emisiones y remociones nacionales": "0",
+    "Búnker internacional": "M.BK",
+    "Aviación internacional": "M.BK.A",
+    "Transporte marítimo y fluvial internacional": "M.BK.M",
+    "A Disposición de residuos sólidos": "5.A",
+    "B Tratamiento biológico de residuos": "5.B",
+    "C Incineración de residuos": "5.C",
+    "D Tratamiento y descarga de aguas residuales": "5.D",
+    "Tierras": "M.2006.3.B",
+}
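+
+# M.BIO and the M.BK codes are PRIMAP codes for the memo items (CO2 from
+# biomass, international bunkers); "Tierras" (land) is parked under the
+# custom code M.2006.3.B and mapped to 3.B during category conversion below.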
+
+
+cat_code_regexp = r"(?P<code>^[A-Za-z0-9]{1,7})\s.*"
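+# example: "1A2 Industrias manufactureras y de la construcción" yields the
+# code "1A2", which convert_ipcc_code_primap_to_primap2 turns into "1.A.2"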
+
+# special header as category code and name in one column
+header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
+
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC1996_2006_PER_INV",
+    "scenario": "PRIMAP",
+}
+
+coords_terminologies_2006 = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+coords_defaults = {
+    "source": "PER-GHG-inventory ",
+    "provenance": "measured",
+    "area": "PER",
+    "scenario": "BUR3",
+}
+
+coords_value_mapping = {
+    "default": {
+        "unit": "PRIMAP1",
+        "entity": {
+            "Emisiones/remociones netas de CO2": "CO2",
+            "CH4": "CH4",
+            "N2O": "N2O",
+            "HFC": f"HFCS ({gwp_to_use})",
+            "PFC": f"PFCS ({gwp_to_use})",
+            "SF6": "SF6",
+            "CO": "CO",
+            "NOx": "NOX",
+            "COVDM": "NMVOC",
+            "SOx": "SOX",
+            "Emisiones/remociones totales de GEI": f"KYOTOGHG ({gwp_to_use})",
+        },
+    },
+}
+
+coords_cols = {"category": "category", "entity": "entity", "unit": "unit"}
+
+add_coords_cols = {
+    "orig_cat_name": ["orig_cat_name", "category"],
+}
+
+filter_remove = {
+    # "f1" :{
+    #     "entity": ["HFC-125", "HFC-134a", "HFC-143a", "HFC-152a", "HFC-227ea",
+    #                "HFC-23", "HFC-32", "HFC-41", "HFC-43-10mee", "PFC-116",
+    #                "PFC-14", "PFC-218", "PFC-318", "NF3", "SF6"],
+    #     "category": "2"
+    # }
+}
+
+meta_data = {
+    "references": "https://unfccc.int/documents/",
+    "rights": "",
+    "contact": "mail@johannes-guetschow.de",
+    "title": "",
+    "comment": "Read fom pdf file by Johannes Gütschow",
+    "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
+}
+
+
+## processing
+cat_conversion = {
+    'mapping': {
+        '0': '0',
+        '1': '1',
+        '1.A': '1.A',
+        '1.A.1': '1.A.1',
+        '1.A.2': '1.A.2',
+        '1.A.3': '1.A.3',
+        '1.A.4': '1.A.4',
+        '1.A.5': '1.A.5',
+        '1.B': '1.B',
+        '1.B.1': '1.B.1',
+        '1.B.2': '1.B.2',
+        '2': '2',
+        '2.A': '2.A',
+        '2.B': '2.B',
+        '2.C': '2.C',
+        '2.D': '2.D',
+        '2.E': '2.E',
+        '2.F': '2.F',
+        '2.G': '2.G',
+        '2.H': '2.H',
+        '3': 'M.AG',
+        '3.A': '3.A',
+        '3.A.1': '3.A.1',
+        '3.A.2': '3.A.2',
+        '3.C': '3.C',
+        '3.C.1': '3.C.1',
+        '3.C.2': '3.C.2',
+        '3.C.3': '3.C.3',
+        '3.C.4': '3.C.4',
+        '3.C.5': '3.C.5',
+        '3.C.6': '3.C.6',
+        '3.C.7': '3.C.7',
+        '4': 'M.LULUCF',
+        'M.2006.3.B': '3.B',
+        '4.A': '3.B.1',
+        '4.B': '3.B.2',
+        '4.C': '3.B.3',
+        '4.D': '3.B.4',
+        '4.E': '3.B.5',
+        '4.F': '3.B.6',
+        '4.G': '3.D.1',
+        '5': '4',
+        '5.A': '4.A',
+        '5.B': '4.B',
+        '5.C': '4.C',
+        '5.D': '4.D',
+        'M.BK': 'M.BK',
+        'M.BK.A': 'M.BK.A',
+        'M.BK.M': 'M.BK.M',
+        'M.BIO': 'M.BIO',
+    },
+    'aggregate': {
+        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
+              'name': 'IPPU'},
+        'M.3.C.AG': {
+            'sources': ['3.C'],
+            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
+        'M.AG.ELV': {'sources': ['M.3.C.AG'],
+                     'name': 'Agriculture excluding livestock emissions'},
+        '3.D': {'sources': ['3.D.1'], 'name': 'Other'},
+        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    },
+}
+
+processing_info = {
+    'basket_copy': {
+        'GWPs_to_add': ["SARGWP100", "AR4GWP100", "AR6GWP100"],
+        'entities': ["HFCS", "PFCS"],
+        'source_GWP': gwp_to_use,
+    },
+}
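+
+# basket_copy: HFC and PFC totals are only reported in AR5 GWP100 terms, so
+# processing copies the HFCS/PFCS baskets to the other GWP contexts using
+# the constant basket-level factors in GWP_factors (an approximation, since
+# the gas-level split is not reported).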

+ 290 - 0
UNFCCC_GHG_data/UNFCCC_reader/Peru/read_PER_BUR3_from_pdf.py

@@ -0,0 +1,290 @@
+# read Peru's third BUR from pdf
+
+
+import camelot
+import primap2 as pm2
+import pandas as pd
+
+import locale
+
+from UNFCCC_GHG_data.helper import process_data_for_country, gas_baskets
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper import fix_rows
+from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
+from config_PER_BUR3 import table_def_templates, table_defs, index_cols
+from config_PER_BUR3 import values_replacement, header_long, cats_remove
+from config_PER_BUR3 import cat_codes_manual, cat_code_regexp, cat_names_fix
+from config_PER_BUR3 import coords_cols, coords_terminologies, coords_defaults
+from config_PER_BUR3 import coords_terminologies_2006
+from config_PER_BUR3 import coords_value_mapping, meta_data, filter_remove
+from config_PER_BUR3 import processing_info, cat_conversion
+
+### general configuration
+input_folder = downloaded_data_path / "UNFCCC" / "Peru" / "BUR3"
+output_folder = extracted_data_path / "UNFCCC" / "Peru"
+if not output_folder.exists():
+    output_folder.mkdir()
+
+output_filename = "PER_BUR3_2023_"
+inventory_file_pdf = "Tercer_BUR_Per%C3%BA_Jun2023.pdf"
+# years_to_read = range(1990, 2018 + 1)
+
+# define locale to use for str to float conversion
+locale_to_use = "es_PE.UTF-8"
+locale.setlocale(locale.LC_NUMERIC, locale_to_use)
+
+pagesToRead = table_defs.keys()
+
+compression = dict(zlib=True, complevel=9)
+
+## part 1: read the data from pdf
+### part 1.a: read the inventory tables (years 2000 to 2019)
+
+data_pm2 = None
+for page in pagesToRead:
+    print(f"++++++++++++++++++++++++++++++++")
+    print(f"+++++ Working on page {page} ++++++")
+    print(f"++++++++++++++++++++++++++++++++")
+
+    df_this_page = None
+    for table_on_page in table_defs[page]["templates"]:
+        print(f"Reading table {table_on_page}")
+        area = table_def_templates[table_on_page]["area"]
+        cols = table_def_templates[table_on_page]["cols"]
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file_pdf),
+            pages=str(page),
+            flavor="stream",
+            table_areas=area,
+            columns=cols,
+        )
+
+        df_current = tables[0].df.copy(deep=True)
+        # drop the old header
+        if "drop_rows" in table_defs[page].keys():
+            df_current = df_current.drop(table_defs[page]["drop_rows"])
+        elif "drop_rows" in table_def_templates[table_on_page].keys():
+            df_current = df_current.drop(
+                table_def_templates[table_on_page]["drop_rows"]
+            )
+        # add new header
+        if "header" in table_defs[page].keys():
+            df_current.columns = pd.MultiIndex.from_tuples(
+                zip(
+                    table_defs[page]["header"]["entity"],
+                    table_defs[page]["header"]["unit"],
+                )
+            )
+        else:
+            df_current.columns = pd.MultiIndex.from_tuples(
+                zip(
+                    table_def_templates[table_on_page]["header"]["entity"],
+                    table_def_templates[table_on_page]["header"]["unit"],
+                )
+            )
+
+        # drop cols if necessary
+        if "drop_cols" in table_defs[page].keys():
+            # print(df_current.columns.values)
+            df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
+        elif "drop_cols" in table_def_templates[table_on_page].keys():
+            df_current = df_current.drop(
+                columns=table_def_templates[table_on_page]["drop_cols"]
+            )
+
+        # rename category column
+        df_current.rename(
+            columns={table_defs[page]["category_col"]: index_cols[0]}, inplace=True
+        )
+
+        # replace line breaks within the category names by spaces
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
+        # replace double and triple spaces
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")
+
+        # fix the split rows
+        for n_rows in table_def_templates[table_on_page]["rows_to_fix"].keys():
+            df_current = fix_rows(
+                df_current,
+                table_def_templates[table_on_page]["rows_to_fix"][n_rows],
+                index_cols[0],
+                n_rows,
+            )
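+        # e.g. "Industrias manufactureras y de la" and "1A2 construcción"
+        # are joined into one row here; cat_names_fix below then moves the
+        # code to the front of the name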
+
+        # replace category names with typos
+        df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)
+
+        # replace cells that contain only a space with empty strings
+        df_current = df_current.replace(values_replacement)
+
+        # set index
+        # df_current = df_current.set_index(index_cols)
+        # strip leading and trailing spaces and remove "^"
+        for col in df_current.columns.values:
+            df_current[col] = df_current[col].str.strip()
+            df_current[col] = df_current[col].str.replace("^", "")
+
+        # print(df_current)
+        # aggregate dfs for this page
+        if df_this_page is None:
+            df_this_page = df_current.copy(deep=True)
+        else:
+            # find intersecting cols
+            cols_this_page = df_this_page.columns.values
+            # print(f"cols this page: {cols_this_page}")
+            cols_current = df_current.columns.values
+            # print(f"cols current: {cols_current}")
+            cols_both = list(set(cols_this_page).intersection(set(cols_current)))
+            # print(f"cols both: {cols_both}")
+            if len(cols_both) > 0:
+                df_this_page = df_this_page.merge(
+                    df_current, how="outer", on=cols_both, suffixes=(None, None)
+                )
+            else:
+                df_this_page = df_this_page.merge(
+                    df_current,
+                    how="outer",
+                    left_index=True,
+                    right_index=True,
+                    suffixes=(None, None),
+                )
+
+            df_this_page = df_this_page.groupby(index_cols).first().reset_index()
+            # print(df_this_page)
+            # df_all = df_all.join(df_current, how='outer')
+
+    # set index and convert to long format
+    df_this_page = df_this_page.set_index(index_cols)
+    df_this_page_long = pm2.pm2io.nir_convert_df_to_long(
+        df_this_page, table_defs[page]["year"], header_long
+    )
+
+    # drop the rows with memo items etc
+    for cat in cats_remove:
+        df_this_page_long = df_this_page_long.drop(
+            df_this_page_long.loc[df_this_page_long.loc[:, index_cols[0]] == cat].index
+        )
+
+    # make a copy of the categories row
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, index_cols[0]]
+
+    # replace cat names by codes in col "Categories"
+    # first the manual replacements
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, "category"].replace(
+        cat_codes_manual
+    )
+    # then the regex replacements
+    repl = lambda m: convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+        :, "category"
+    ].str.replace(cat_code_regexp, repl, regex=True)
+    df_this_page_long.loc[:, "category"].unique()
+
+    # strip spaces in data col
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.strip()
+
+    df_this_page_long = df_this_page_long.reset_index(drop=True)
+
+    # make sure all col headers are str
+    df_this_page_long.columns = df_this_page_long.columns.map(str)
+
+    # remove "." thousands separators as pd.to_numeric can't deal with them
+    # (regex=False: treat the dot literally)
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+        ".", "", regex=False
+    )
+    pat = r"^(?P<first>[0-9\.,]*),(?P<last>[0-9\.,]*)$"
+    repl = lambda m: f"{m.group('first')}.{m.group('last')}"
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+        pat, repl, regex=True
+    )
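+    # example: "12.345,67" becomes "12345,67" after the separator removal
+    # above and "12345.67" after the decimal-comma regex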
+
+    # df_this_page_long["data"] = df_this_page_long["data"].str.replace("^.$","",
+    #                                                                   regex=True)
+
+    # drop orig cat name as it's not unique over all tables (keep until here in case
+    # it's needed for debugging)
+    df_this_page_long = df_this_page_long.drop(columns="orig_cat_name")
+
+    data_page_if = pm2.pm2io.convert_long_dataframe_if(
+        df_this_page_long,
+        coords_cols=coords_cols,
+        # add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping[
+            table_defs[page]["coords_value_mapping"]
+        ],
+        # coords_value_filling=coords_value_filling,
+        filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
+
+    # conversion to PRIMAP2 native format
+    data_page_pm2 = pm2.pm2io.from_interchange_format(data_page_if)
+
+    # combine with tables from other pages
+    if data_pm2 is None:
+        data_pm2 = data_page_pm2
+    else:
+        data_pm2 = data_pm2.pr.merge(data_page_pm2)
+
+# convert back to IF to have units in the fixed format
+data_if = data_pm2.pr.to_interchange_format()
+
+# ###
+# save data to IF and native format
+# ###
+if not output_folder.exists():
+    output_folder.mkdir()
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+    data_if,
+)
+
+encoding = {var: compression for var in data_pm2.data_vars}
+data_pm2.pr.to_netcdf(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+    encoding=encoding,
+)
+
+
+# ###
+# ## process the data
+# ###
+data_proc_pm2 = data_pm2
+
+# actual processing
+
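+# process_data_for_country converts the categories to IPCC2006_PRIMAP via
+# cat_conversion (mapping plus aggregation, e.g. M.AG + M.LULUCF -> 3),
+# copies the HFCS/PFCS baskets to additional GWP contexts (basket_copy),
+# and fills gas baskets such as KYOTOGHG from the individual gases.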
+data_proc_pm2 = process_data_for_country(
+    data_proc_pm2,
+    entities_to_ignore=[],
+    gas_baskets=gas_baskets,
+    processing_info_country=processing_info,
+    cat_terminology_out=coords_terminologies_2006["category"],
+    category_conversion=cat_conversion,
+)
+
+# adapt source and metadata
+current_source = data_proc_pm2.coords["source"].values[0]
+data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
+data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
+
+# ###
+# save data to IF and native format
+# ###
+data_proc_if = data_proc_pm2.pr.to_interchange_format()
+if not output_folder.exists():
+    output_folder.mkdir()
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + coords_terminologies_2006["category"]),
+    data_proc_if,
+)
+
+encoding = {var: compression for var in data_proc_pm2.data_vars}
+data_proc_pm2.pr.to_netcdf(
+    output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+    encoding=encoding,
+)

+ 340 - 232
UNFCCC_GHG_data/helper/functions.py

@@ -15,76 +15,84 @@ from .definitions import root_path, downloaded_data_path, extracted_data_path
 from .definitions import legacy_data_path, code_path
 from .definitions import legacy_data_path, code_path
 from .definitions import GWP_factors
 from .definitions import GWP_factors
 
 
+
 def process_data_for_country(
 def process_data_for_country(
-        data_country: xr.Dataset,
-        entities_to_ignore: List[str],
-        gas_baskets: Dict[str, List[str]],
-        filter_dims: Optional[Dict[str, List[str]]] = None,
-        cat_terminology_out: Optional[str] = None,
-        category_conversion: Dict[str, Dict] = None,
-        sectors_out: List[str] = None,
-        processing_info_country: Dict = None,
+    data_country: xr.Dataset,
+    entities_to_ignore: List[str],
+    gas_baskets: Dict[str, List[str]],
+    filter_dims: Optional[Dict[str, List[str]]] = None,
+    cat_terminology_out: Optional[str] = None,
+    category_conversion: Dict[str, Dict] = None,
+    sectors_out: List[str] = None,
+    processing_info_country: Dict = None,
 ) -> xr.Dataset:
 ) -> xr.Dataset:
     """
     """
-        Process data from DI interface (where necessary).
-        * Downscaling including subtraction of time series
-        * country specific sector aggregation
-        * Conversion to IPCC2006 categories
-        * general sector and gas basket aggregation (in new categories)
+    Process data from DI interface (where necessary).
+    * Downscaling including subtraction of time series
+    * country specific sector aggregation
+    * Conversion to IPCC2006 categories
+    * general sector and gas basket aggregation (in new categories)
     """
     """
 
 
     # 0: gather information
     # 0: gather information
-    countries = list(data_country.coords[data_country.attrs['area']].values)
+    countries = list(data_country.coords[data_country.attrs["area"]].values)
     if len(countries) > 1:
     if len(countries) > 1:
         raise ValueError(
         raise ValueError(
             f"Found {len(countries)} countries. Only single country data "
             f"Found {len(countries)} countries. Only single country data "
-            f"can be processed by this function. countries: {countries}")
+            f"can be processed by this function. countries: {countries}"
+        )
     else:
     else:
         country_code = countries[0]
         country_code = countries[0]
 
 
     # get category terminology
     # get category terminology
-    cat_col = data_country.attrs['cat']
-    temp = re.findall(r'\((.*)\)', cat_col)
+    cat_col = data_country.attrs["cat"]
+    temp = re.findall(r"\((.*)\)", cat_col)
     cat_terminology_in = temp[0]
     cat_terminology_in = temp[0]
 
 
     # get scenario
     # get scenario
-    scenarios = list(data_country.coords[data_country.attrs['scen']].values)
+    scenarios = list(data_country.coords[data_country.attrs["scen"]].values)
     if len(scenarios) > 1:
     if len(scenarios) > 1:
         raise ValueError(
         raise ValueError(
             f"Found {len(scenarios)} scenarios. Only single scenario data "
             f"Found {len(scenarios)} scenarios. Only single scenario data "
-            f"can be processed by this function. Scenarios: {scenarios}")
+            f"can be processed by this function. Scenarios: {scenarios}"
+        )
     scenario = scenarios[0]
     scenario = scenarios[0]
 
 
     # get source
     # get source
-    sources = list(data_country.coords['source'].values)
+    sources = list(data_country.coords["source"].values)
     if len(sources) > 1:
     if len(sources) > 1:
         raise ValueError(
         raise ValueError(
             f"Found {len(sources)} sources. Only single source data "
             f"Found {len(sources)} sources. Only single source data "
-            f"can be processed by this function. Sources: {sources}")
+            f"can be processed by this function. Sources: {sources}"
+        )
     source = sources[0]
     source = sources[0]
 
 
     # check if category name column present
     # check if category name column present
     # TODO: replace 'name' in config by  'additional_cols' dict that defines the cols
     # TODO: replace 'name' in config by  'additional_cols' dict that defines the cols
     #  and the values
     #  and the values
-    if 'orig_cat_name' in data_country.coords:
+    if "orig_cat_name" in data_country.coords:
         cat_name_present = True
         cat_name_present = True
     else:
     else:
         cat_name_present = False
         cat_name_present = False
 
 
     # 1: general processing
     # 1: general processing
     # remove unused cats
     # remove unused cats
-    data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
+    data_country = data_country.dropna(f"category ({cat_terminology_in})", how="all")
     # remove unused years
     # remove unused years
-    data_country = data_country.dropna(f'time', how='all')
+    data_country = data_country.dropna(f"time", how="all")
     # remove variables only containing nan
     # remove variables only containing nan
-    nan_vars_country = [var for var in data_country.data_vars if
-                        bool(data_country[var].isnull().all().data) is True]
+    nan_vars_country = [
+        var
+        for var in data_country.data_vars
+        if bool(data_country[var].isnull().all().data) is True
+    ]
     print(f"removing all-nan variables: {nan_vars_country}")
     print(f"removing all-nan variables: {nan_vars_country}")
     data_country = data_country.drop_vars(nan_vars_country)
     data_country = data_country.drop_vars(nan_vars_country)
 
 
     # remove unnecessary variables
     # remove unnecessary variables
-    entities_ignore_present = [entity for entity in entities_to_ignore if
-                               entity in data_country.data_vars]
+    entities_ignore_present = [
+        entity for entity in entities_to_ignore if entity in data_country.data_vars
+    ]
     data_country = data_country.drop_vars(entities_ignore_present)
     data_country = data_country.drop_vars(entities_ignore_present)
 
 
     # filter ()
     # filter ()
@@ -93,167 +101,200 @@ def process_data_for_country(
 
 
     # 2: country specific processing
     # 2: country specific processing
     if processing_info_country is not None:
     if processing_info_country is not None:
-
-        if 'tolerance' in processing_info_country:
+        if "tolerance" in processing_info_country:
             tolerance = processing_info_country["tolerance"]
             tolerance = processing_info_country["tolerance"]
         else:
         else:
             tolerance = 0.01
             tolerance = 0.01
 
 
         # remove entities if needed
         # remove entities if needed
-        if 'ignore_entities' in processing_info_country:
-            entities_to_ignore_country = processing_info_country[
-                'ignore_entities']
-            entities_ignore_present = \
-                [entity for entity in entities_to_ignore_country if
-                 entity in data_country.data_vars]
+        if "ignore_entities" in processing_info_country:
+            entities_to_ignore_country = processing_info_country["ignore_entities"]
+            entities_ignore_present = [
+                entity
+                for entity in entities_to_ignore_country
+                if entity in data_country.data_vars
+            ]
             data_country = data_country.drop_vars(entities_ignore_present)
             data_country = data_country.drop_vars(entities_ignore_present)
 
 
         # take only desired years
         # take only desired years
-        if 'years' in processing_info_country:
+        if "years" in processing_info_country:
             data_country = data_country.pr.loc[
             data_country = data_country.pr.loc[
-                {'time': processing_info_country['years']}]
+                {"time": processing_info_country["years"]}
+            ]
 
 
         # remove timeseries if desired
         # remove timeseries if desired
-        if 'remove_ts' in processing_info_country:
-            for case in processing_info_country['remove_ts']:
-                remove_info = copy.deepcopy(processing_info_country['remove_ts'][case])
+        if "remove_ts" in processing_info_country:
+            for case in processing_info_country["remove_ts"]:
+                remove_info = copy.deepcopy(processing_info_country["remove_ts"][case])
                 entities = remove_info.pop("entities")
                 entities = remove_info.pop("entities")
                 for entity in entities:
                 for entity in entities:
-                    data_country[entity].pr.loc[remove_info] = \
+                    data_country[entity].pr.loc[remove_info] = (
                         data_country[entity].pr.loc[remove_info] * np.nan
                         data_country[entity].pr.loc[remove_info] * np.nan
+                    )
 
 
         # remove all data for given years if necessary
         # remove all data for given years if necessary
-        if 'remove_years' in processing_info_country:
+        if "remove_years" in processing_info_country:
             data_country = data_country.drop_sel(
             data_country = data_country.drop_sel(
-                time=processing_info_country['remove_years'])
+                time=processing_info_country["remove_years"]
+            )
 
 
         # subtract categories
         # subtract categories
-        if 'subtract_cats' in processing_info_country:
-            subtract_cats_current = processing_info_country['subtract_cats']
+        if "subtract_cats" in processing_info_country:
+            subtract_cats_current = processing_info_country["subtract_cats"]
             print(f"Subtracting categories for country {country_code}")
             print(f"Subtracting categories for country {country_code}")
             for cat_to_generate in subtract_cats_current:
             for cat_to_generate in subtract_cats_current:
-                if 'entities' in subtract_cats_current[cat_to_generate].keys():
-                    entities_current = subtract_cats_current[cat_to_generate]['entities']
+                if "entities" in subtract_cats_current[cat_to_generate].keys():
+                    entities_current = subtract_cats_current[cat_to_generate][
+                        "entities"
+                    ]
                 else:
                 else:
                     entities_current = list(data_country.data_vars)
                     entities_current = list(data_country.data_vars)
 
 
-                cats_to_subtract = \
-                    subtract_cats_current[cat_to_generate]['subtract']
-                data_sub = \
-                    data_country[entities_current].pr.loc[
-                        {'category': cats_to_subtract}].pr.sum(
-                        dim='category', skipna=True, min_count=1)
+                cats_to_subtract = subtract_cats_current[cat_to_generate]["subtract"]
+                data_sub = (
+                    data_country[entities_current]
+                    .pr.loc[{"category": cats_to_subtract}]
+                    .pr.sum(dim="category", skipna=True, min_count=1)
+                )
                 data_parent = data_country[entities_current].pr.loc[
                 data_parent = data_country[entities_current].pr.loc[
-                    {'category': subtract_cats_current[cat_to_generate]['parent']}]
+                    {"category": subtract_cats_current[cat_to_generate]["parent"]}
+                ]
                 data_agg = data_parent - data_sub
                 data_agg = data_parent - data_sub
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if data_agg[var].isnull().all().data is True
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
                 if len(data_agg.data_vars) > 0:
                     print(f"Generating {cat_to_generate} through subtraction")
                     print(f"Generating {cat_to_generate} through subtraction")
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
 
 
                     data_agg = data_agg.assign_coords(
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_generate])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_generate],
+                            )
+                        }
+                    )
                     if cat_name_present:
                     if cat_name_present:
-                        cat_name = subtract_cats_current[cat_to_generate]['name']
+                        cat_name = subtract_cats_current[cat_to_generate]["name"]
                         data_agg = data_agg.assign_coords(
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(data_agg, tolerance=tolerance)
                 else:
                 else:
                     print(f"no data to generate category {cat_to_generate}")
                     print(f"no data to generate category {cat_to_generate}")
 
 
         # downscaling
         # downscaling
-        if 'downscale' in processing_info_country:
-            if 'sectors' in processing_info_country['downscale']:
-                sector_downscaling = \
-                    processing_info_country['downscale']['sectors']
+        if "downscale" in processing_info_country:
+            if "sectors" in processing_info_country["downscale"]:
+                sector_downscaling = processing_info_country["downscale"]["sectors"]
                 for case in sector_downscaling.keys():
                 for case in sector_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     print(f"Downscaling for {case}.")
                     sector_downscaling_current = sector_downscaling[case]
                     sector_downscaling_current = sector_downscaling[case]
-                    entities = sector_downscaling_current.pop('entities')
+                    entities = sector_downscaling_current.pop("entities")
                     for entity in entities:
                     for entity in entities:
                         data_country[entity] = data_country[
                         data_country[entity] = data_country[
-                            entity].pr.downscale_timeseries(
-                            **sector_downscaling_current)
+                            entity
+                        ].pr.downscale_timeseries(**sector_downscaling_current)
                         # , skipna_evaluation_dims=None)
                         # , skipna_evaluation_dims=None)
 
 
-            if 'entities' in processing_info_country['downscale']:
-                entity_downscaling = \
-                    processing_info_country['downscale']['entities']
+            if "entities" in processing_info_country["downscale"]:
+                entity_downscaling = processing_info_country["downscale"]["entities"]
                 for case in entity_downscaling.keys():
                 for case in entity_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     print(f"Downscaling for {case}.")
                     # print(data_country.coords[f'category ('
                     # print(data_country.coords[f'category ('
                     #                          f'{cat_terminology_in})'].values)
                     #                          f'{cat_terminology_in})'].values)
                     data_country = data_country.pr.downscale_gas_timeseries(
                     data_country = data_country.pr.downscale_gas_timeseries(
-                        **entity_downscaling[case], skipna=True,
-                        skipna_evaluation_dims=None)
+                        **entity_downscaling[case],
+                        skipna=True,
+                        skipna_evaluation_dims=None,
+                    )
 
 
         # aggregate categories
         # aggregate categories
-        if 'aggregate_cats' in processing_info_country:
-            if 'agg_tolerance' in processing_info_country:
-                agg_tolerance = processing_info_country['agg_tolerance']
+        if "aggregate_cats" in processing_info_country:
+            if "agg_tolerance" in processing_info_country:
+                agg_tolerance = processing_info_country["agg_tolerance"]
             else:
             else:
                 agg_tolerance = tolerance
                 agg_tolerance = tolerance
-            aggregate_cats_current = processing_info_country['aggregate_cats']
+            aggregate_cats_current = processing_info_country["aggregate_cats"]
             print(
             print(
                 f"Aggregating categories for country {country_code}, source {source}, "
                 f"Aggregating categories for country {country_code}, source {source}, "
-                f"scenario {scenario}")
+                f"scenario {scenario}"
+            )
             for cat_to_agg in aggregate_cats_current:
             for cat_to_agg in aggregate_cats_current:
                 print(f"Category: {cat_to_agg}")
                 print(f"Category: {cat_to_agg}")
-                source_cats = aggregate_cats_current[cat_to_agg]['sources']
-                data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
-                    dim='category', skipna=True, min_count=1)
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                source_cats = aggregate_cats_current[cat_to_agg]["sources"]
+                data_agg = data_country.pr.loc[{"category": source_cats}].pr.sum(
+                    dim="category", skipna=True, min_count=1
+                )
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if data_agg[var].isnull().all().data is True
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
                 if len(data_agg.data_vars) > 0:
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
                     data_agg = data_agg.assign_coords(
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_agg])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_agg],
+                            )
+                        }
+                    )
                     if cat_name_present:
                     if cat_name_present:
-                        cat_name = aggregate_cats_current[cat_to_agg]['name']
+                        cat_name = aggregate_cats_current[cat_to_agg]["name"]
                         data_agg = data_agg.assign_coords(
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=agg_tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(
+                        data_agg, tolerance=agg_tolerance
+                    )
                 else:
                 else:
                     print(f"no data to aggregate category {cat_to_agg}")
                     print(f"no data to aggregate category {cat_to_agg}")
 
 
         # copy HFCs and PFCs with default factors
         # copy HFCs and PFCs with default factors
-        if 'basket_copy' in processing_info_country:
+        if "basket_copy" in processing_info_country:
             GWPs_to_add = processing_info_country["basket_copy"]["GWPs_to_add"]
             GWPs_to_add = processing_info_country["basket_copy"]["GWPs_to_add"]
             entities = processing_info_country["basket_copy"]["entities"]
             entities = processing_info_country["basket_copy"]["entities"]
             source_GWP = processing_info_country["basket_copy"]["source_GWP"]
             source_GWP = processing_info_country["basket_copy"]["source_GWP"]
             for entity in entities:
             for entity in entities:
-                data_source = data_country[f'{entity} ({source_GWP})']
+                data_source = data_country[f"{entity} ({source_GWP})"]
                 for GWP in GWPs_to_add:
                 for GWP in GWPs_to_add:
-                    data_GWP = data_source * \
-                               GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    data_GWP = (
+                        data_source * GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    )
                     data_GWP.attrs["entity"] = entity
                     data_GWP.attrs["entity"] = entity
                     data_GWP.attrs["gwp_context"] = GWP
                     data_GWP.attrs["gwp_context"] = GWP
                     data_country[f"{entity} ({GWP})"] = data_GWP
                     data_country[f"{entity} ({GWP})"] = data_GWP
 
 
         # aggregate gases if desired
         # aggregate gases if desired
-        if 'aggregate_gases' in processing_info_country:
+        if "aggregate_gases" in processing_info_country:
             # TODO: why use different code here than below. Can this fill non-existen
             # TODO: why use different code here than below. Can this fill non-existen
             #  gas baskets?
             #  gas baskets?
-            for case in processing_info_country['aggregate_gases'].keys():
-                case_info = processing_info_country['aggregate_gases'][case]
-                data_country[case_info['basket']] = \
-                    data_country.pr.fill_na_gas_basket_from_contents(
-                        **case_info)
+            for case in processing_info_country["aggregate_gases"].keys():
+                case_info = processing_info_country["aggregate_gases"][case]
+                data_country[
+                    case_info["basket"]
+                ] = data_country.pr.fill_na_gas_basket_from_contents(**case_info)
 
 
     # 3: map categories
     # 3: map categories
     if category_conversion is not None:
     if category_conversion is not None:
@@ -270,61 +311,74 @@ def process_data_for_country(
     # more general processing
     # more general processing
     # reduce categories to output cats
     # reduce categories to output cats
     if sectors_out is not None:
     if sectors_out is not None:
-        cats_to_keep = [cat for cat in
-                        data_country.coords[f'category ({cat_terminology_out})'].values
-                        if cat in sectors_out]
-        data_country = data_country.pr.loc[{'category': cats_to_keep}]
+        cats_to_keep = [
+            cat
+            for cat in data_country.coords[f"category ({cat_terminology_out})"].values
+            if cat in sectors_out
+        ]
+        data_country = data_country.pr.loc[{"category": cats_to_keep}]
 
 
     # create gas baskets
     # create gas baskets
     entities_present = set(data_country.data_vars)
     entities_present = set(data_country.data_vars)
     for basket in gas_baskets.keys():
     for basket in gas_baskets.keys():
-        basket_contents_present = [gas for gas in gas_baskets[basket] if
-                                   gas in entities_present]
+        basket_contents_present = [
+            gas for gas in gas_baskets[basket] if gas in entities_present
+        ]
         if len(basket_contents_present) > 0:
             if basket in list(data_country.data_vars):
                 data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
-                    basket=basket, basket_contents=basket_contents_present,
-                    skipna=True, min_count=1)
+                    basket=basket,
+                    basket_contents=basket_contents_present,
+                    skipna=True,
+                    min_count=1,
+                )
             else:
                 try:
-                    #print(data_country.data_vars)
-                    data_country[basket] = xr.full_like(data_country["CO2"],
-                                                        np.nan).pr.quantify(
-                        units="Gg CO2 / year")
-                    data_country[basket].attrs = {"entity": basket.split(' ')[0],
-                                                  "gwp_context": basket.split(' ')[1][
-                                                                 1:-1]}
+                    # print(data_country.data_vars)
+                    data_country[basket] = xr.full_like(
+                        data_country["CO2"], np.nan
+                    ).pr.quantify(units="Gg CO2 / year")
+                    data_country[basket].attrs = {
+                        "entity": basket.split(" ")[0],
+                        "gwp_context": basket.split(" ")[1][1:-1],
+                    }
                     data_country[basket] = data_country.pr.gas_basket_contents_sum(
-                        basket=basket, basket_contents=basket_contents_present,
-                        min_count=1)
+                        basket=basket,
+                        basket_contents=basket_contents_present,
+                        min_count=1,
+                    )
                     entities_present.add(basket)
                 except Exception as ex:
-                    print(f"No gas basket created for {country_code}, {source}, "
-                          f"{scenario}: {ex}")
+                    print(
+                        f"No gas basket created for {country_code}, {source}, "
+                        f"{scenario}: {ex}"
+                    )
 
     # amend title and comment
-    data_country.attrs["comment"] = data_country.attrs["comment"] + f" Processed on " \
-                                                                    f"{date.today()}"
-    data_country.attrs["title"] = data_country.attrs["title"] + f" Processed on " \
-                                                                    f"{date.today()}"
+    data_country.attrs["comment"] = (
+        data_country.attrs["comment"] + f" Processed on " f"{date.today()}"
+    )
+    data_country.attrs["title"] = (
+        data_country.attrs["title"] + f" Processed on " f"{date.today()}"
+    )
 
     return data_country
 
 
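The basket handling above leans on primap2 accessors; stripped of unit and metadata handling, computing a basket from its member gases is a gas-wise sum with min_count semantics. A rough plain-xarray equivalent (variable names and values invented for illustration):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"CO2": ("time", [100.0, 110.0]), "CH4": ("time", [25.0, np.nan])})
    basket_contents = ["CO2", "CH4"]
    # min_count=1: the sum stays NaN only where every member gas is NaN
    ds["KYOTOGHG (AR4GWP100)"] = (
        ds[basket_contents].to_array("entity").sum("entity", skipna=True, min_count=1)
    )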
 def convert_categories(
-        ds_input: xr.Dataset,
-        conversion: Dict[str, Dict[str, str]],
-        #terminology_from: str,
-        terminology_to: str,
-        debug: bool=False,
-        tolerance: float=0.01,
-)->xr.Dataset:
+    ds_input: xr.Dataset,
+    conversion: Dict[str, Dict[str, str]],
+    # terminology_from: str,
+    terminology_to: str,
+    debug: bool = False,
+    tolerance: float = 0.01,
+) -> xr.Dataset:
     """
     """
     convert data from one category terminology to another
     convert data from one category terminology to another
     """
     """
     print(f"converting categories to {terminology_to}")
     print(f"converting categories to {terminology_to}")
 
-    if 'orig_cat_name' in ds_input.coords:
+    if "orig_cat_name" in ds_input.coords:
         cat_name_present = True
     else:
         cat_name_present = False
@@ -338,50 +392,67 @@ def convert_categories(
     ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})
 
     # find categories present in dataset
-    cats_present = list(ds_converted.coords[f'category ({terminology_to})'])
+    cats_present = list(ds_converted.coords[f"category ({terminology_to})"])
 
     # restrict categories and map category names
-    if 'mapping' in conversion.keys():
-        mapping_cats_present = [cat for cat in list(conversion['mapping'].keys()) if
-                                cat in cats_present]
-        ds_converted = ds_converted.pr.loc[
-            {'category': mapping_cats_present}]
-
-        from_cats = ds_converted.coords[f'category ({terminology_to})'].values
-        to_cats = pd.Series(from_cats).replace(conversion['mapping'])
-        ds_converted = ds_converted.assign_coords({f'category ({terminology_to})':
-                                                   (f'category ({terminology_to})',
-                                                    to_cats)})
+    if "mapping" in conversion.keys():
+        mapping_cats_present = [
+            cat for cat in list(conversion["mapping"].keys()) if cat in cats_present
+        ]
+        ds_converted = ds_converted.pr.loc[{"category": mapping_cats_present}]
+
+        from_cats = ds_converted.coords[f"category ({terminology_to})"].values
+        to_cats = pd.Series(from_cats).replace(conversion["mapping"])
+        ds_converted = ds_converted.assign_coords(
+            {f"category ({terminology_to})": (f"category ({terminology_to})", to_cats)}
+        )
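The renaming above goes through pandas to vectorise the dict lookup; the same pattern in isolation, with invented category codes:

    import pandas as pd

    mapping = {"1A1": "1.A.1", "1A2": "1.A.2"}
    to_cats = pd.Series(["1A1", "1A2"]).replace(mapping)
    print(list(to_cats))  # ['1.A.1', '1.A.2']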
 
     # redo the list of present cats after mapping, as we have new categories in the
     # target terminology now
-    cats_present_mapped = list(ds_converted.coords[f'category ('
-                                                   f'{terminology_to})'].values)
+    cats_present_mapped = list(
+        ds_converted.coords[f"category (" f"{terminology_to})"].values
+    )
     # aggregate categories
-    if 'aggregate' in conversion:
-        aggregate_cats = conversion['aggregate']
+    if "aggregate" in conversion:
+        aggregate_cats = conversion["aggregate"]
         for cat_to_agg in aggregate_cats:
             if debug:
                 print(f"Category: {cat_to_agg}")
-            source_cats = [cat for cat in aggregate_cats[cat_to_agg]['sources'] if
-                           cat in cats_present_mapped]
+            source_cats = [
+                cat
+                for cat in aggregate_cats[cat_to_agg]["sources"]
+                if cat in cats_present_mapped
+            ]
             if debug:
                 print(source_cats)
-            data_agg = ds_converted.pr.loc[{'category': source_cats}].pr.sum(
-                dim='category', skipna=True, min_count=1)
-            nan_vars = [var for var in data_agg.data_vars if
-                        data_agg[var].isnull().all().data == True]
+            data_agg = ds_converted.pr.loc[{"category": source_cats}].pr.sum(
+                dim="category", skipna=True, min_count=1
+            )
+            nan_vars = [
+                var
+                for var in data_agg.data_vars
+                if data_agg[var].isnull().all().data == True
+            ]
             data_agg = data_agg.drop(nan_vars)
             if len(data_agg.data_vars) > 0:
-                data_agg = data_agg.expand_dims([f'category ({terminology_to})'])
+                data_agg = data_agg.expand_dims([f"category ({terminology_to})"])
                 data_agg = data_agg.assign_coords(
-                    coords={f'category ({terminology_to})':
-                                (f'category ({terminology_to})', [cat_to_agg])})
+                    coords={
+                        f"category ({terminology_to})": (
+                            f"category ({terminology_to})",
+                            [cat_to_agg],
+                        )
+                    }
+                )
                 if cat_name_present:
                     data_agg = data_agg.assign_coords(
-                        coords={'orig_cat_name':
-                                    (f'category ({terminology_to})',
-                                     [aggregate_cats[cat_to_agg]['name']])})
+                        coords={
+                            "orig_cat_name": (
+                                f"category ({terminology_to})",
+                                [aggregate_cats[cat_to_agg]["name"]],
+                            )
+                        }
+                    )
                 ds_converted = ds_converted.pr.merge(data_agg, tolerance=tolerance)
                 cats_present_mapped.append(cat_to_agg)
             else:
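At its core the aggregation above is a labelled sum over the category dimension; primap2's pr.loc and pr.sum add unit and metadata handling on top. A sketch with plain xarray (category codes and values invented):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(
        [[1.0, 2.0], [3.0, np.nan]],
        dims=["category", "time"],
        coords={"category": ["3A1", "3A2"]},
    )
    # skipna with min_count=1 keeps the result NaN only if all sources are NaN
    agg = da.sum("category", skipna=True, min_count=1)
    print(agg.values)  # [4. 2.]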
@@ -391,9 +462,9 @@ def convert_categories(
 
 
 def get_country_name(
-        country_code: str,
+    country_code: str,
 ) -> str:
-    """get country name from code """
+    """get country name from code"""
     if country_code in custom_country_mapping:
         country_name = custom_country_mapping[country_code]
     else:
@@ -401,15 +472,16 @@ def get_country_name(
             country = pycountry.countries.get(alpha_3=country_code)
             country_name = country.name
         except:
-            raise ValueError(f"Country code {country_code} can not be mapped to "
-                             f"any country")
+            raise ValueError(
+                f"Country code {country_code} can not be mapped to " f"any country"
+            )
 
     return country_name
 
 
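pycountry.countries.get does an exact lookup on ISO 3166 fields, e.g.:

    import pycountry

    country = pycountry.countries.get(alpha_3="PER")
    print(country.name)  # Peru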
 def get_country_code(
-        country_name: str,
-)->str:
+    country_name: str,
+) -> str:
     """
     """
     obtain country code. If the input is a code it will be returned,
     obtain country code. If the input is a code it will be returned,
     if the input
     if the input
@@ -435,28 +507,31 @@ def get_country_code(
             country_code = country.alpha_3
         except:
             try:
-                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
+                country = pycountry.countries.search_fuzzy(
+                    country_name.replace("_", " ")
+                )
             except:
-                raise ValueError(f"Country name {country_name} can not be mapped to "
-                                 f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
+                raise ValueError(
+                    f"Country name {country_name} can not be mapped to "
+                    f"any country code. Try using the ISO3 code directly."
+                )
             if len(country) > 1:
                 country_code = None
                 for current_country in country:
                     if current_country.name == country_name:
                         country_code = current_country.alpha_3
                 if country_code is None:
-                    raise ValueError(f"Country name {country_name} has {len(country)} "
-                                     f"possible results for country codes.")
+                    raise ValueError(
+                        f"Country name {country_name} has {len(country)} "
+                        f"possible results for country codes."
+                    )
 
             country_code = country[0].alpha_3
 
     return country_code
 
 
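Unlike get, search_fuzzy returns a list of candidate countries ordered by match quality, which is why the code above has to disambiguate when more than one result comes back:

    import pycountry

    candidates = pycountry.countries.search_fuzzy("Peru")
    print([c.alpha_3 for c in candidates])  # first candidate: 'PER'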
-def create_folder_mapping(
-        folder: str,
-        extracted: bool = False
-) -> None:
+def create_folder_mapping(folder: str, extracted: bool = False) -> None:
     """
     """
     Create a mapping from 3 letter ISO country codes to folders
     Create a mapping from 3 letter ISO country codes to folders
     based on the subfolders of the given folder. The mapping is
     based on the subfolders of the given folder. The mapping is
@@ -480,9 +555,9 @@ def create_folder_mapping(
 
     folder = root_path / folder
     folder_mapping = {}
-    #if not extracted:
+    # if not extracted:
     known_folders = custom_folders
-    #else:
+    # else:
     #    known_folders = {}
 
     for item in folder.iterdir():
@@ -491,7 +566,9 @@ def create_folder_mapping(
                 ISO3 = known_folders[item.name]
             else:
                 try:
-                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
+                    country = pycountry.countries.search_fuzzy(
+                        item.name.replace("_", " ")
+                    )
                     if len(country) > 1:
                         ISO3 = None
                         for current_country in country:
@@ -516,8 +593,8 @@ def create_folder_mapping(
 
 # TODO add crf
 def get_country_submissions(
-        country_name: str,
-        print_sub: bool = True,
+    country_name: str,
+    print_sub: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO code for a country, or the country's name.
@@ -585,8 +662,8 @@ def get_country_submissions(
 
 
 def get_country_datasets(
-        country_name: str,
-        print_ds: bool = True,
+    country_name: str,
+    print_ds: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO code for a country, or the country's name.
@@ -638,35 +715,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
 
                 datasets_current_folder = {}
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] =\
-                            dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong code: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology})"
                         key = f"{parts[1]} ({parts[2]}, {terminology})"
                         data_info = ""
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
                             data_info = data_info + "incomplete IF (.yaml), "
 
 
                         code_file = get_code_file(country_code, parts[1])
                         code_file = get_code_file(country_code, parts[1])
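The file stems being parsed here follow the pattern <ISO3>_<submission>_<date or version>_<terminology parts>; a standalone illustration of the split (the stem itself is hypothetical):

    stem = "PER_BUR3_2023_IPCC2006_PRIMAP"  # hypothetical file stem
    parts = stem.split("_")
    terminology = "_".join(parts[3:])
    key = f"{parts[1]} ({parts[2]}, {terminology})"
    print(key)  # BUR3 (2023, IPCC2006_PRIMAP)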
@@ -680,7 +764,9 @@ def get_country_datasets(
                 if print_ds:
                     if cleaned_datasets_current_folder:
                         for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                            print(
+                                f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                            )
                     else:
                         print("No data available")
                     print("")
@@ -708,34 +794,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
 
                 datasets_current_folder = {}
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong code: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                         key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                         data_info = ""
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
                             data_info = data_info + "incomplete IF (.yaml), "
 
 
                         cleaned_datasets_current_folder[key] = data_info
                         cleaned_datasets_current_folder[key] = data_info
@@ -743,7 +837,9 @@ def get_country_datasets(
                 if print_ds:
                     if cleaned_datasets_current_folder:
                         for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                            print(
+                                f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                            )
                     else:
                         print("No data available")
                     print("")
@@ -759,9 +855,9 @@ def get_country_datasets(
 
 
 def get_code_file(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
+    country_name: str,
+    submission: str,
+    print_info: bool = False,
 ) -> Path:
     """
     For given country name and submission find the script that creates the data
@@ -813,13 +909,17 @@ def get_code_file(
         for file in country_folder.iterdir():
             if file.match(code_file_name_candidate):
                 if code_file_path is not None:
-                    raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
-                                     f"{code_file_path} and file.name. "
-                                     f"Please use only one file with name "
-                                     f"'read_ISO3_submission_XXX.YYY'.")
+                    raise ValueError(
+                        f"Found multiple code file candidates: "
+                        f"{code_file_path} and {file.name}. "
+                        f"Please use only one file with name "
+                        f"'read_ISO3_submission_XXX.YYY'."
+                    )
                 else:
                     if print_info:
-                        print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
+                        print(
+                            f"Found code file {file.relative_to(root_path)}"
+                        )
                 code_file_path = file
 
     if code_file_path is not None:
@@ -828,8 +928,10 @@ def get_code_file(
         return None
 
 
-def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int)->pd.DataFrame:
-    '''
+def fix_rows(
+    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
+) -> pd.DataFrame:
+    """
     Function to fix rows that have been split during reading from PDF.
     This is the version used for Malaysia BUR3 and BUR4; adapt for other BURs if needed.
 
@@ -838,18 +940,20 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
     :param col_to_use:
     :param n_rows:
     :return:
-    '''
+    """
     for row in rows_to_fix:
-        #print(row)
+        # print(row)
         # find the row number and collect the row and the following rows
         index = data.loc[data[col_to_use] == row].index
-        #print(list(index))
+        # print(list(index))
         if not list(index):
             print(f"Can't merge split row {row}")
             print(data[col_to_use])
-        #print(f"Merging split row {row} for table {page}")
+        # print(f"Merging split row {row} for table {page}")
         loc = data.index.get_loc(index[0])
-        if n_rows == -3:
+        if n_rows == -2:
+            locs_to_merge = list(range(loc - 1, loc + 1))
+        elif n_rows == -3:
             locs_to_merge = list(range(loc - 1, loc + 2))
         elif n_rows == -5:
             locs_to_merge = list(range(loc - 1, loc + 4))
@@ -858,7 +962,7 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         rows_to_merge = data.iloc[locs_to_merge]
         indices_to_merge = rows_to_merge.index
         # join the rows
-        new_row = rows_to_merge.agg(' '.join)
+        new_row = rows_to_merge.agg(" ".join)
         # replace the double spaces that are created
         # must be done here and not at the end as splits are not always
         # the same and join would produce different col values
@@ -866,6 +970,10 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         new_row = new_row.str.replace("N O", "NO")
         new_row = new_row.str.replace("N O", "NO")
         new_row = new_row.str.replace(", N", ",N")
         new_row = new_row.str.replace(", N", ",N")
         new_row = new_row.str.replace("- ", "-")
         new_row = new_row.str.replace("- ", "-")
+        # replace spaces in numbers
+        pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
+        repl = lambda m: f"{m.group('first')}{m.group('last')}"
+        new_row = new_row.str.replace(pat, repl, regex=True)
         data.loc[indices_to_merge[0]] = new_row
         data.loc[indices_to_merge[0]] = new_row
         data = data.drop(indices_to_merge[1:])
         data = data.drop(indices_to_merge[1:])
-    return data
+    return data
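The added pattern repairs numbers that picked up a stray space during PDF extraction: the two named groups match digit, comma, and period runs, and the callable replacement concatenates them. A standalone demonstration with invented cell values:

    import pandas as pd

    pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
    repl = lambda m: f"{m.group('first')}{m.group('last')}"

    cells = pd.Series(["1 234.5", "12.3", "NO x"])
    print(list(cells.str.replace(pat, repl, regex=True)))
    # ['1234.5', '12.3', 'NO x'] -- only purely numeric pairs are merged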

+ 3 - 0
pyproject.toml

@@ -6,3 +6,6 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
 
+[tool.black]
+line-length = 88
+
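88 characters is black's own default line length, so the [tool.black] section mainly pins the project's formatting choice explicitly; running black over the package then reproduces the reformatting seen in the hunks above.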

+ 2 - 1
setup.cfg

@@ -30,7 +30,7 @@ packages =
     UNFCCC_GHG_data.UNFCCC_downloader
     UNFCCC_GHG_data.UNFCCC_DI_reader
     UNFCCC_GHG_data.helper
-    #UNFCCC_GHG_data.datasets
+#UNFCCC_GHG_data.datasets
 python_requires = >=3.8
 setup_requires =
     setuptools_scm
@@ -70,6 +70,7 @@ dev =
     jupyter
     dask
     ipympl
+    black
 
 
 [options.package_data]