
Add Peru BUR3 code and some modifications to functions

Johannes Gütschow 1 year ago
parent commit 61639b9f1e

+ 560 - 0
UNFCCC_GHG_data/UNFCCC_reader/Peru/config_PER_BUR3.py

@@ -0,0 +1,560 @@
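+# camelot extraction templates, keyed by PDF page number: "area" is the table
+# bounding box and "cols" are the column separators (both in PDF points).
+# "rows_to_fix" lists category names that the PDF layout splits over several
+# physical rows; the dict key is the n_rows argument passed to the fix_rows
+# helper in the reading script.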
+table_def_templates = {
+    "300": {  # 300
+        "area": ["69,457,727,78"],
+        "cols": ["288,352,391,426,458,485,519,552,587,615,643"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de uso",
+                "Uso de productos sustitutos de las sustancias que",
+            ],
+            2: [
+                "1A Actividades de quema de combustible",
+                "2A Industria de los minerales",
+                "2B Industria química",
+                "2C Industria de los metales",
+                "2E Industria electrónica",
+                "3A Ganado",
+                "3A1 Fermentación entérica",
+            ],
+        },
+    },
+    "301": {  # 301
+        "area": ["72,542,727,99"],
+        "cols": ["288,352,391,426,458,485,519,552,587,615,643"],
+        "rows_to_fix": {
+            3: [
+                "Fuentes agregadas y fuentes de emisión no CO2 de",
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            2: [
+                "3A2 Manejo del estiércol",
+                "3C1 Emisiones por quema de biomasa",
+                "3C3 Aplicación de urea",
+                "3C7 Cultivo de arroz",
+                "A Disposición de residuos sólidos",
+                "B Tratamiento biológico de residuos",
+                "C Incineración de residuos",
+                "D Tratamiento y descarga de aguas residuales",
+                "Búnker internacional",
+            ],
+        },
+    },
+    "302": {  # 302
+        "area": ["72,510,727,79"],
+        "cols": ["278,335,376,415,453,482,512,548,585,623,656"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emision no CO2",
+            ],
+            -3: ["Total de las emisiones y remociones nacionales"],
+        },
+    },
+    "303": {  # 303
+        "area": ["72,540,727,127"],
+        "cols": ["278,335,376,415,453,482,512,548,585,623,656"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            2: ["Aviación internacional"],
+        },
+    },
+    "304": {  # 304
+        "area": ["72,510,727,70"],
+        "cols": ["275,332,365,408,441,470,499,533,577,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "305": {  # 305
+        "area": ["72,540,727,108"],
+        "cols": ["275,332,365,408,441,470,499,533,577,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+    "306": {  # 306
+        "area": ["72,510,727,70"],
+        "cols": ["266,320,364,405,440,468,499,536,576,620,656"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la",
+                "Productos no energéticos de combustibles y",
+                "Uso de productos sustitutos de las sustancias",
+                "Fuentes agregadas y fuentes de emisión no",
+            ],
+        },
+    },
+    "307": {  # 307
+        "area": ["72,540,727,108"],
+        "cols": ["266,320,364,405,440,468,499,536,576,620,656"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA",
+            ],
+        },
+    },
+    "308": {  # 308
+        "area": ["72,510,727,70"],
+        "cols": ["278,329,372,406,441,470,500,536,579,621,653"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "309": {  # 309
+        "area": ["72,540,727,117"],
+        "cols": ["278,329,372,406,441,470,500,536,579,621,653"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+    "310": {  # 310
+        "area": ["72,510,727,70"],
+        "cols": ["279,334,379,418,453,480,505,541,582,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "311": {  # 311
+        "area": ["72,540,727,110"],
+        "cols": ["279,334,379,418,453,480,505,541,582,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            -2: ["Emisiones de CO2 de la biomasa"],
+        },
+    },
+    "312": {  # 312
+        "area": ["72,510,727,70"],
+        "cols": ["297,349,393,426,461,489,514,547,592,629,657"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones fugitivas provenientes de la fabricación de",
+                "Productos no energéticos de combustibles y de uso de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2 de la",
+            ],
+        },
+    },
+    "313": {  # 313
+        "area": ["72,540,727,90"],
+        "cols": ["297,349,393,426,461,489,514,547,592,629,657"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+}
+
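+# column header shared by all tables: one entity and one unit per column. The
+# reading script zips the two lists into a pandas MultiIndex header.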
+header = {
+    "entity": [
+        "Categorías de emisiones y sumideros de GEI",
+        "Emisiones/remociones netas de CO2",
+        "CH4",
+        "N2O",
+        "HFC",
+        "PFC",
+        "SF6",
+        "CO",
+        "NOx",
+        "COVDM",
+        "SOX",
+        "Emisiones/remociones totales de GEI",
+    ],
+    "unit": [
+        "",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+        "GgCO2eq",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+    ],
+}
+
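+# per-page table definitions: the template to use, the header, the raw header
+# rows to drop, the name of the category column, and the inventory year that
+# the page covers.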
+table_defs = {
+    "300": {
+        "templates": ["300"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4, 5],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2000,
+        "coords_value_mapping": "default",
+    },
+    "301": {
+        "templates": ["301"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4, 5],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2000,
+        "coords_value_mapping": "default",
+    },
+    "302": {
+        "templates": ["302"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2005,
+        "coords_value_mapping": "default",
+    },
+    "303": {
+        "templates": ["303"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2005,
+        "coords_value_mapping": "default",
+    },
+    "304": {
+        "templates": ["304"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2010,
+        "coords_value_mapping": "default",
+    },
+    "305": {
+        "templates": ["305"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2010,
+        "coords_value_mapping": "default",
+    },
+    "306": {
+        "templates": ["306"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2012,
+        "coords_value_mapping": "default",
+    },
+    "307": {
+        "templates": ["307"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2012,
+        "coords_value_mapping": "default",
+    },
+    "308": {
+        "templates": ["308"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2014,
+        "coords_value_mapping": "default",
+    },
+    "309": {
+        "templates": ["309"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2014,
+        "coords_value_mapping": "default",
+    },
+    "310": {
+        "templates": ["310"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2016,
+        "coords_value_mapping": "default",
+    },
+    "311": {
+        "templates": ["311"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2016,
+        "coords_value_mapping": "default",
+    },
+    "312": {
+        "templates": ["312"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2019,
+        "coords_value_mapping": "default",
+    },
+    "313": {
+        "templates": ["313"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2019,
+        "coords_value_mapping": "default",
+    },
+}
+
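+# camelot sometimes merges the category code into the middle of a wrapped
+# category name; map those garbled strings back to "<code> <name>" so the
+# code regexp below can pick up the code.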
+cat_names_fix = {
+    "Industrias manufactureras y de la 1A2 construcción":
+        "1A2 Industrias manufactureras y de la construcción",
+    "Emisiones fugitivas provenientes de la fabricación 1B de combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Emisiones fugitivas provenientes de la 1B fabricación de combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Emisiones fugitivas provenientes de la fabricación de 1B combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Productos no energéticos de combustibles y de uso 2D de solventes":
+        "2D Productos no energéticos de combustibles y de uso de solventes",
+    "Productos no energéticos de combustibles y de 2D uso de solventes":
+        "2D Productos no energéticos de combustibles y de uso de solventes",
+    "Uso de productos sustitutos de las sustancias que 2F agotan la capa de ozono":
+        "2F Uso de productos sustitutos de las sustancias que agotan la capa de ozono",
+    "Uso de productos sustitutos de las sustancias 2F que agotan la capa de ozono":
+        "2F Uso de productos sustitutos de las sustancias que agotan la capa de ozono",
+    "Fuentes agregadas y fuentes de emisión no CO2 de 3C la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emision no CO2 3C de la tierra":
+        "3C Fuentes agregadas y fuentes de emision no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no CO2 3C de la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no 3C CO2 de la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no CO2 de la 3C tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Emisiones directas de N2O en suelos 3C4 gestionados":
+        "3C4 Emisiones directas de N2O en suelos gestionados",
+    "Emisiones indirectas de N2O en suelos 3C5 gestionados":
+        "3C5 Emisiones indirectas de N2O en suelos gestionados",
+    "Emisiones indirectas de N2O por manejo del 3C6 estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "Emisiones indirectas de N2O por manejo 3C6 del estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "Emisiones indirectas de N2O por 3C6 manejo del estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y 4 SILVICULTURA":
+        "4 USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y SILVICULTURA",
+    "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA 4 Y SILVICULTURA":
+        "4 USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y SILVICULTURA",
+}
+
+values_replacement = {
+    #    '': '-',
+    " ": "",
+}
+
+gwp_to_use = "AR5GWP100"
+
+index_cols = ["orig_cat_name"]
+cols_for_space_stripping = index_cols
+
+unit_row = "header"
+
+## parameters part 2: conversion to PRIMAP2 interchange format
+
+cats_remove = ["Partidas informativas"]
+
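+# categories whose codes cannot be parsed from the name; "M."-prefixed codes
+# are PRIMAP's custom (meta) categories.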
+cat_codes_manual = {
+    "Emisiones de CO2 de la biomasa": "M.BIO",
+    "Total de las emisiones y remociones nacionales": "0",
+    "Búnker internacional": "M.BK",
+    "Aviación internacional": "M.BK.A",
+    "Transporte marítimo y fluvial internacional": "M.BK.M",
+    "A Disposición de residuos sólidos": "5.A",
+    "B Tratamiento biológico de residuos": "5.B",
+    "C Incineración de residuos": "5.C",
+    "D Tratamiento y descarga de aguas residuales": "5.D",
+    "Tierras": "M.2006.3.B",
+}
+
+
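+# regexp to extract the category code from the beginning of the category name,
+# e.g. "1A2 Industrias manufactureras ..." -> "1A2"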
+cat_code_regexp = r"(?P<code>^[A-Za-z0-9]{1,7})\s.*"
+
+# special header as category code and name in one column
+header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
+
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC1996_2006_PER_INV",
+    "scenario": "PRIMAP",
+}
+
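+# terminology for the processed output; the raw data keeps the inventory's own
+# category scheme (IPCC1996_2006_PER_INV) defined above.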
+coords_terminologies_2006 = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+coords_defaults = {
+    "source": "PER-GHG-inventory ",
+    "provenance": "measured",
+    "area": "PER",
+    "scenario": "BUR3",
+}
+
+coords_value_mapping = {
+    "default": {
+        "unit": "PRIMAP1",
+        "entity": {
+            "Emisiones/remociones netas de CO2": "CO2",
+            "CH4": "CH4",
+            "N2O": "N2O",
+            "HFC": f"HFCS ({gwp_to_use})",
+            "PFC": f"PFCS ({gwp_to_use})",
+            "SF6": "SF6",
+            "CO": "CO",
+            "NOx": "NOX",
+            "COVDM": "NMVOC",
+            "SOx": "SOX",
+            "Emisiones/remociones totales de GEI": f"KYOTOGHG ({gwp_to_use})",
+        },
+    },
+}
+
+coords_cols = {"category": "category", "entity": "entity", "unit": "unit"}
+
+add_coords_cols = {
+    "orig_cat_name": ["orig_cat_name", "category"],
+}
+
+filter_remove = {
+    # "f1" :{
+    #     "entity": ["HFC-125", "HFC-134a", "HFC-143a", "HFC-152a", "HFC-227ea",
+    #                "HFC-23", "HFC-32", "HFC-41", "HFC-43-10mee", "PFC-116",
+    #                "PFC-14", "PFC-218", "PFC-318", "NF3", "SF6"],
+    #     "category": "2"
+    # }
+}
+
+meta_data = {
+    "references": "https://unfccc.int/documents/",
+    "rights": "",
+    "contact": "mail@johannes-guetschow.de",
+    "title": "",
+    "comment": "Read fom pdf file by Johannes Gütschow",
+    "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
+}
+
+
+## processing
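+# conversion to IPCC2006_PRIMAP categories: the inventory uses 4 for LULUCF
+# and 5 for waste, which map to 3.B/3.D and 4 in IPCC 2006; the "aggregate"
+# block then rebuilds the totals for the categories created by the mapping.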
+cat_conversion = {
+    'mapping': {
+        '0': '0',
+        '1': '1',
+        '1.A': '1.A',
+        '1.A.1': '1.A.1',
+        '1.A.2': '1.A.2',
+        '1.A.3': '1.A.3',
+        '1.A.4': '1.A.4',
+        '1.A.5': '1.A.5',
+        '1.B': '1.B',
+        '1.B.1': '1.B.1',
+        '1.B.2': '1.B.2',
+        '2': '2',
+        '2.A': '2.A',
+        '2.B': '2.B',
+        '2.C': '2.C',
+        '2.D': '2.D',
+        '2.E': '2.E',
+        '2.F': '2.F',
+        '2.G': '2.G',
+        '2.H': '2.H',
+        '3': 'M.AG',
+        '3.A': '3.A',
+        '3.A.1': '3.A.1',
+        '3.A.2': '3.A.2',
+        '3.C': '3.C',
+        '3.C.1': '3.C.1',
+        '3.C.2': '3.C.2',
+        '3.C.3': '3.C.3',
+        '3.C.4': '3.C.4',
+        '3.C.5': '3.C.5',
+        '3.C.6': '3.C.6',
+        '3.C.7': '3.C.7',
+        '4': 'M.LULUCF',
+        'M.2006.3.B': '3.B',
+        '4.A': '3.B.1',
+        '4.B': '3.B.2',
+        '4.C': '3.B.3',
+        '4.D': '3.B.4',
+        '4.E': '3.B.5',
+        '4.F': '3.B.6',
+        '4.G': '3.D.1',
+        '5': '4',
+        '5.A': '4.A',
+        '5.B': '4.B',
+        '5.C': '4.C',
+        '5.D': '4.D',
+        'M.BK': 'M.BK',
+        'M.BK.A': 'M.BK.A',
+        'M.BK.M': 'M.BK.M',
+        'M.BIO': 'M.BIO',
+    },
+    'aggregate': {
+        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
+              'name': 'IPPU'},
+        'M.3.C.AG': {
+            'sources': ['3.C'],
+            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
+        'M.AG.ELV': {'sources': ['M.3.C.AG'],
+                     'name': 'Agriculture excluding livestock emissions'},
+        '3.D': {'sources': ['3.D.1'], 'name': 'Other'},
+        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    },
+}
+
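+# the inventory reports HFC and PFC baskets in AR5 GWPs only; copy them to the
+# other GWP contexts using the GWP_factors from the helper module.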
+processing_info = {
+    'basket_copy': {
+        'GWPs_to_add': ["SARGWP100", "AR4GWP100", "AR6GWP100"],
+        'entities': ["HFCS", "PFCS"],
+        'source_GWP': gwp_to_use,
+    },
+}

+ 290 - 0
UNFCCC_GHG_data/UNFCCC_reader/Peru/read_PER_BUR3_from_pdf.py

@@ -0,0 +1,290 @@
+# read Peru's third Biennial Update Report (BUR3) from pdf
+
+
+import camelot
+import primap2 as pm2
+import pandas as pd
+
+import locale
+
+from UNFCCC_GHG_data.helper import process_data_for_country, gas_baskets
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper import fix_rows
+from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
+from config_PER_BUR3 import table_def_templates, table_defs, index_cols
+from config_PER_BUR3 import values_replacement, header_long, cats_remove
+from config_PER_BUR3 import cat_codes_manual, cat_code_regexp, cat_names_fix
+from config_PER_BUR3 import coords_cols, coords_terminologies, coords_defaults
+from config_PER_BUR3 import coords_terminologies_2006
+from config_PER_BUR3 import coords_value_mapping, meta_data, filter_remove
+from config_PER_BUR3 import processing_info, cat_conversion
+
+### general configuration
+input_folder = downloaded_data_path / "UNFCCC" / "Peru" / "BUR3"
+output_folder = extracted_data_path / "UNFCCC" / "Peru"
+if not output_folder.exists():
+    output_folder.mkdir()
+
+output_filename = "PER_BUR3_2023_"
+inventory_file_pdf = "Tercer_BUR_Per%C3%BA_Jun2023.pdf"
+# years_to_read = range(1990, 2018 + 1)
+
+# define locale to use for str to float conversion
+locale_to_use = "es_PE.UTF-8"
+locale.setlocale(locale.LC_NUMERIC, locale_to_use)
+
+pagesToRead = table_defs.keys()
+
+compression = dict(zlib=True, complevel=9)
+
+## part 1: read the data from pdf
+### part 1.a: read the inventory tables (2000-2019)
+
+data_pm2 = None
+for page in pagesToRead:
+    print(f"++++++++++++++++++++++++++++++++")
+    print(f"+++++ Working on page {page} ++++++")
+    print(f"++++++++++++++++++++++++++++++++")
+
+    df_this_page = None
+    for table_on_page in table_defs[page]["templates"]:
+        print(f"Reading table {table_on_page}")
+        area = table_def_templates[table_on_page]["area"]
+        cols = table_def_templates[table_on_page]["cols"]
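+        # camelot's stream flavor parses whitespace-separated tables; the
+        # explicit table area and column positions come from the template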
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file_pdf),
+            pages=str(page),
+            flavor="stream",
+            table_areas=area,
+            columns=cols,
+        )
+
+        df_current = tables[0].df.copy(deep=True)
+        # drop the old header
+        if "drop_rows" in table_defs[page].keys():
+            df_current = df_current.drop(table_defs[page]["drop_rows"])
+        elif "drop_rows" in table_def_templates[table_on_page].keys():
+            df_current = df_current.drop(
+                table_def_templates[table_on_page]["drop_rows"]
+            )
+        # add new header
+        if "header" in table_defs[page].keys():
+            df_current.columns = pd.MultiIndex.from_tuples(
+                zip(
+                    table_defs[page]["header"]["entity"],
+                    table_defs[page]["header"]["unit"],
+                )
+            )
+        else:
+            df_current.columns = pd.MultiIndex.from_tuples(
+                zip(
+                    table_def_templates[table_on_page]["header"]["entity"],
+                    table_def_templates[table_on_page]["header"]["unit"],
+                )
+            )
+
+        # drop cols if necessary
+        if "drop_cols" in table_defs[page].keys():
+            # print(df_current.columns.values)
+            df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
+        elif "drop_cols" in table_def_templates[table_on_page].keys():
+            df_current = df_current.drop(
+                columns=table_def_templates[table_on_page]["drop_cols"]
+            )
+
+        # rename category column
+        df_current.rename(
+            columns={table_defs[page]["category_col"]: index_cols[0]}, inplace=True
+        )
+
+        # replace line breaks with spaces
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
+        # replace double and triple spaces
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")
+
+        # fix the split rows
+        for n_rows in table_def_templates[table_on_page]["rows_to_fix"].keys():
+            df_current = fix_rows(
+                df_current,
+                table_def_templates[table_on_page]["rows_to_fix"][n_rows],
+                index_cols[0],
+                n_rows,
+            )
+
+        # replace category names with typos
+        df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)
+
+        # replace empty strings
+        df_current = df_current.replace(values_replacement)
+
+        # set index
+        # df_current = df_current.set_index(index_cols)
+        # strip leading and trailing spaces and remove "^"
+        for col in df_current.columns.values:
+            df_current[col] = df_current[col].str.strip()
+            df_current[col] = df_current[col].str.replace("^", "")
+
+        # print(df_current)
+        # aggregate dfs for this page
+        if df_this_page is None:
+            df_this_page = df_current.copy(deep=True)
+        else:
+            # find intersecting cols
+            cols_this_page = df_this_page.columns.values
+            # print(f"cols this page: {cols_this_page}")
+            cols_current = df_current.columns.values
+            # print(f"cols current: {cols_current}")
+            cols_both = list(set(cols_this_page).intersection(set(cols_current)))
+            # print(f"cols both: {cols_both}")
+            if len(cols_both) > 0:
+                df_this_page = df_this_page.merge(
+                    df_current, how="outer", on=cols_both, suffixes=(None, None)
+                )
+            else:
+                df_this_page = df_this_page.merge(
+                    df_current,
+                    how="outer",
+                    left_index=True,
+                    right_index=True,
+                    suffixes=(None, None),
+                )
+
+            df_this_page = df_this_page.groupby(index_cols).first().reset_index()
+            # print(df_this_page)
+            # df_all = df_all.join(df_current, how='outer')
+
+    # set index and convert to long format
+    df_this_page = df_this_page.set_index(index_cols)
+    df_this_page_long = pm2.pm2io.nir_convert_df_to_long(
+        df_this_page, table_defs[page]["year"], header_long
+    )
+
+    # drop the rows with memo items etc
+    for cat in cats_remove:
+        df_this_page_long = df_this_page_long.drop(
+            df_this_page_long.loc[df_this_page_long.loc[:, index_cols[0]] == cat].index
+        )
+
+    # make a copy of the category names column
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, index_cols[0]]
+
+    # replace cat names by codes in the "category" column
+    # first the manual replacements
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, "category"].replace(
+        cat_codes_manual
+    )
+    # then the regex replacements
+    repl = lambda m: convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+        :, "category"
+    ].str.replace(cat_code_regexp, repl, regex=True)
+    df_this_page_long.loc[:, "category"].unique()
+
+    # strip spaces in data col
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.strip()
+
+    df_this_page_long = df_this_page_long.reset_index(drop=True)
+
+    # make sure all col headers are str
+    df_this_page_long.columns = df_this_page_long.columns.map(str)
+
+    # remove thousands separators as pd.to_numeric can't deal with that
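+    # the tables use "." as thousands separator and "," as decimal separator:
+    # e.g. "12.345,67" -> "12345,67" here, then -> "12345.67" below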
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+        ".", ""
+    )
+    pat = r"^(?P<first>[0-9\.,]*),(?P<last>[0-9\.,]*)$"
+    repl = lambda m: f"{m.group('first')}.{m.group('last')}"
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+        pat, repl, regex=True
+    )
+
+    # df_this_page_long["data"] = df_this_page_long["data"].str.replace("^.$","",
+    #                                                                   regex=True)
+
+    # drop orig cat name as it's not unique over all tables (keep until here in case
+    # it's needed for debugging)
+    df_this_page_long = df_this_page_long.drop(columns="orig_cat_name")
+
+    data_page_if = pm2.pm2io.convert_long_dataframe_if(
+        df_this_page_long,
+        coords_cols=coords_cols,
+        # add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping[
+            table_defs[page]["coords_value_mapping"]
+        ],
+        # coords_value_filling=coords_value_filling,
+        filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
+
+    # conversion to PRIMAP2 native format
+    data_page_pm2 = pm2.pm2io.from_interchange_format(data_page_if)
+
+    # combine with tables from other pages
+    if data_pm2 is None:
+        data_pm2 = data_page_pm2
+    else:
+        data_pm2 = data_pm2.pr.merge(data_page_pm2)
+
+# convert back to IF to have units in the fixed format
+data_if = data_pm2.pr.to_interchange_format()
+
+# ###
+# save data to IF and native format
+# ###
+if not output_folder.exists():
+    output_folder.mkdir()
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+    data_if,
+)
+
+encoding = {var: compression for var in data_pm2.data_vars}
+data_pm2.pr.to_netcdf(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+    encoding=encoding,
+)
+
+
+# ###
+# ## process the data
+# ###
+data_proc_pm2 = data_pm2
+
+# actual processing
+
+data_proc_pm2 = process_data_for_country(
+    data_proc_pm2,
+    entities_to_ignore=[],
+    gas_baskets=gas_baskets,
+    processing_info_country=processing_info,
+    cat_terminology_out=coords_terminologies_2006["category"],
+    category_conversion=cat_conversion,
+)
+
+# adapt source and metadata
+current_source = data_proc_pm2.coords["source"].values[0]
+data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
+data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
+
+# ###
+# save data to IF and native format
+# ###
+data_proc_if = data_proc_pm2.pr.to_interchange_format()
+if not output_folder.exists():
+    output_folder.mkdir()
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + coords_terminologies_2006["category"]),
+    data_proc_if,
+)
+
+encoding = {var: compression for var in data_proc_pm2.data_vars}
+data_proc_pm2.pr.to_netcdf(
+    output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+    encoding=encoding,
+)

+ 340 - 232
UNFCCC_GHG_data/helper/functions.py

@@ -15,76 +15,84 @@ from .definitions import root_path, downloaded_data_path, extracted_data_path
 from .definitions import legacy_data_path, code_path
 from .definitions import GWP_factors
 
+
 def process_data_for_country(
-        data_country: xr.Dataset,
-        entities_to_ignore: List[str],
-        gas_baskets: Dict[str, List[str]],
-        filter_dims: Optional[Dict[str, List[str]]] = None,
-        cat_terminology_out: Optional[str] = None,
-        category_conversion: Dict[str, Dict] = None,
-        sectors_out: List[str] = None,
-        processing_info_country: Dict = None,
+    data_country: xr.Dataset,
+    entities_to_ignore: List[str],
+    gas_baskets: Dict[str, List[str]],
+    filter_dims: Optional[Dict[str, List[str]]] = None,
+    cat_terminology_out: Optional[str] = None,
+    category_conversion: Dict[str, Dict] = None,
+    sectors_out: List[str] = None,
+    processing_info_country: Dict = None,
 ) -> xr.Dataset:
     """
-        Process data from DI interface (where necessary).
-        * Downscaling including subtraction of time series
-        * country specific sector aggregation
-        * Conversion to IPCC2006 categories
-        * general sector and gas basket aggregation (in new categories)
+    Process data from DI interface (where necessary).
+    * Downscaling including subtraction of time series
+    * country specific sector aggregation
+    * Conversion to IPCC2006 categories
+    * general sector and gas basket aggregation (in new categories)
     """
 
     # 0: gather information
-    countries = list(data_country.coords[data_country.attrs['area']].values)
+    countries = list(data_country.coords[data_country.attrs["area"]].values)
     if len(countries) > 1:
         raise ValueError(
             f"Found {len(countries)} countries. Only single country data "
-            f"can be processed by this function. countries: {countries}")
+            f"can be processed by this function. countries: {countries}"
+        )
     else:
         country_code = countries[0]
 
     # get category terminology
-    cat_col = data_country.attrs['cat']
-    temp = re.findall(r'\((.*)\)', cat_col)
+    cat_col = data_country.attrs["cat"]
+    temp = re.findall(r"\((.*)\)", cat_col)
     cat_terminology_in = temp[0]
 
     # get scenario
-    scenarios = list(data_country.coords[data_country.attrs['scen']].values)
+    scenarios = list(data_country.coords[data_country.attrs["scen"]].values)
     if len(scenarios) > 1:
         raise ValueError(
             f"Found {len(scenarios)} scenarios. Only single scenario data "
-            f"can be processed by this function. Scenarios: {scenarios}")
+            f"can be processed by this function. Scenarios: {scenarios}"
+        )
     scenario = scenarios[0]
 
     # get source
-    sources = list(data_country.coords['source'].values)
+    sources = list(data_country.coords["source"].values)
     if len(sources) > 1:
         raise ValueError(
             f"Found {len(sources)} sources. Only single source data "
-            f"can be processed by this function. Sources: {sources}")
+            f"can be processed by this function. Sources: {sources}"
+        )
     source = sources[0]
 
     # check if category name column present
     # TODO: replace 'name' in config by  'additional_cols' dict that defines the cols
     #  and the values
-    if 'orig_cat_name' in data_country.coords:
+    if "orig_cat_name" in data_country.coords:
         cat_name_present = True
     else:
         cat_name_present = False
 
     # 1: general processing
     # remove unused cats
-    data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
+    data_country = data_country.dropna(f"category ({cat_terminology_in})", how="all")
     # remove unused years
-    data_country = data_country.dropna(f'time', how='all')
+    data_country = data_country.dropna(f"time", how="all")
     # remove variables only containing nan
-    nan_vars_country = [var for var in data_country.data_vars if
-                        bool(data_country[var].isnull().all().data) is True]
+    nan_vars_country = [
+        var
+        for var in data_country.data_vars
+        if bool(data_country[var].isnull().all().data) is True
+    ]
     print(f"removing all-nan variables: {nan_vars_country}")
     data_country = data_country.drop_vars(nan_vars_country)
 
     # remove unnecessary variables
-    entities_ignore_present = [entity for entity in entities_to_ignore if
-                               entity in data_country.data_vars]
+    entities_ignore_present = [
+        entity for entity in entities_to_ignore if entity in data_country.data_vars
+    ]
     data_country = data_country.drop_vars(entities_ignore_present)
 
     # filter ()
@@ -93,167 +101,200 @@ def process_data_for_country(
 
     # 2: country specific processing
     if processing_info_country is not None:
-
-        if 'tolerance' in processing_info_country:
+        if "tolerance" in processing_info_country:
             tolerance = processing_info_country["tolerance"]
         else:
             tolerance = 0.01
 
         # remove entities if needed
-        if 'ignore_entities' in processing_info_country:
-            entities_to_ignore_country = processing_info_country[
-                'ignore_entities']
-            entities_ignore_present = \
-                [entity for entity in entities_to_ignore_country if
-                 entity in data_country.data_vars]
+        if "ignore_entities" in processing_info_country:
+            entities_to_ignore_country = processing_info_country["ignore_entities"]
+            entities_ignore_present = [
+                entity
+                for entity in entities_to_ignore_country
+                if entity in data_country.data_vars
+            ]
             data_country = data_country.drop_vars(entities_ignore_present)
 
         # take only desired years
-        if 'years' in processing_info_country:
+        if "years" in processing_info_country:
             data_country = data_country.pr.loc[
-                {'time': processing_info_country['years']}]
+                {"time": processing_info_country["years"]}
+            ]
 
         # remove timeseries if desired
-        if 'remove_ts' in processing_info_country:
-            for case in processing_info_country['remove_ts']:
-                remove_info = copy.deepcopy(processing_info_country['remove_ts'][case])
+        if "remove_ts" in processing_info_country:
+            for case in processing_info_country["remove_ts"]:
+                remove_info = copy.deepcopy(processing_info_country["remove_ts"][case])
                 entities = remove_info.pop("entities")
                 for entity in entities:
-                    data_country[entity].pr.loc[remove_info] = \
+                    data_country[entity].pr.loc[remove_info] = (
                         data_country[entity].pr.loc[remove_info] * np.nan
+                    )
 
         # remove all data for given years if necessary
-        if 'remove_years' in processing_info_country:
+        if "remove_years" in processing_info_country:
             data_country = data_country.drop_sel(
-                time=processing_info_country['remove_years'])
+                time=processing_info_country["remove_years"]
+            )
 
         # subtract categories
-        if 'subtract_cats' in processing_info_country:
-            subtract_cats_current = processing_info_country['subtract_cats']
+        if "subtract_cats" in processing_info_country:
+            subtract_cats_current = processing_info_country["subtract_cats"]
             print(f"Subtracting categories for country {country_code}")
             for cat_to_generate in subtract_cats_current:
-                if 'entities' in subtract_cats_current[cat_to_generate].keys():
-                    entities_current = subtract_cats_current[cat_to_generate]['entities']
+                if "entities" in subtract_cats_current[cat_to_generate].keys():
+                    entities_current = subtract_cats_current[cat_to_generate][
+                        "entities"
+                    ]
                 else:
                     entities_current = list(data_country.data_vars)
 
-                cats_to_subtract = \
-                    subtract_cats_current[cat_to_generate]['subtract']
-                data_sub = \
-                    data_country[entities_current].pr.loc[
-                        {'category': cats_to_subtract}].pr.sum(
-                        dim='category', skipna=True, min_count=1)
+                cats_to_subtract = subtract_cats_current[cat_to_generate]["subtract"]
+                data_sub = (
+                    data_country[entities_current]
+                    .pr.loc[{"category": cats_to_subtract}]
+                    .pr.sum(dim="category", skipna=True, min_count=1)
+                )
                 data_parent = data_country[entities_current].pr.loc[
-                    {'category': subtract_cats_current[cat_to_generate]['parent']}]
+                    {"category": subtract_cats_current[cat_to_generate]["parent"]}
+                ]
                 data_agg = data_parent - data_sub
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if bool(data_agg[var].isnull().all().data)
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
                     print(f"Generating {cat_to_generate} through subtraction")
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
 
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_generate])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_generate],
+                            )
+                        }
+                    )
                     if cat_name_present:
-                        cat_name = subtract_cats_current[cat_to_generate]['name']
+                        cat_name = subtract_cats_current[cat_to_generate]["name"]
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(data_agg, tolerance=tolerance)
                 else:
                     print(f"no data to generate category {cat_to_generate}")
 
         # downscaling
-        if 'downscale' in processing_info_country:
-            if 'sectors' in processing_info_country['downscale']:
-                sector_downscaling = \
-                    processing_info_country['downscale']['sectors']
+        if "downscale" in processing_info_country:
+            if "sectors" in processing_info_country["downscale"]:
+                sector_downscaling = processing_info_country["downscale"]["sectors"]
                 for case in sector_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     sector_downscaling_current = sector_downscaling[case]
-                    entities = sector_downscaling_current.pop('entities')
+                    entities = sector_downscaling_current.pop("entities")
                     for entity in entities:
                         data_country[entity] = data_country[
-                            entity].pr.downscale_timeseries(
-                            **sector_downscaling_current)
+                            entity
+                        ].pr.downscale_timeseries(**sector_downscaling_current)
                         # , skipna_evaluation_dims=None)
 
-            if 'entities' in processing_info_country['downscale']:
-                entity_downscaling = \
-                    processing_info_country['downscale']['entities']
+            if "entities" in processing_info_country["downscale"]:
+                entity_downscaling = processing_info_country["downscale"]["entities"]
                 for case in entity_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     # print(data_country.coords[f'category ('
                     #                          f'{cat_terminology_in})'].values)
                     data_country = data_country.pr.downscale_gas_timeseries(
-                        **entity_downscaling[case], skipna=True,
-                        skipna_evaluation_dims=None)
+                        **entity_downscaling[case],
+                        skipna=True,
+                        skipna_evaluation_dims=None,
+                    )
 
         # aggregate categories
-        if 'aggregate_cats' in processing_info_country:
-            if 'agg_tolerance' in processing_info_country:
-                agg_tolerance = processing_info_country['agg_tolerance']
+        if "aggregate_cats" in processing_info_country:
+            if "agg_tolerance" in processing_info_country:
+                agg_tolerance = processing_info_country["agg_tolerance"]
             else:
                 agg_tolerance = tolerance
-            aggregate_cats_current = processing_info_country['aggregate_cats']
+            aggregate_cats_current = processing_info_country["aggregate_cats"]
             print(
                 f"Aggregating categories for country {country_code}, source {source}, "
-                f"scenario {scenario}")
+                f"scenario {scenario}"
+            )
             for cat_to_agg in aggregate_cats_current:
                 print(f"Category: {cat_to_agg}")
-                source_cats = aggregate_cats_current[cat_to_agg]['sources']
-                data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
-                    dim='category', skipna=True, min_count=1)
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                source_cats = aggregate_cats_current[cat_to_agg]["sources"]
+                data_agg = data_country.pr.loc[{"category": source_cats}].pr.sum(
+                    dim="category", skipna=True, min_count=1
+                )
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if bool(data_agg[var].isnull().all().data)
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_agg])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_agg],
+                            )
+                        }
+                    )
                     if cat_name_present:
-                        cat_name = aggregate_cats_current[cat_to_agg]['name']
+                        cat_name = aggregate_cats_current[cat_to_agg]["name"]
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=agg_tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(
+                        data_agg, tolerance=agg_tolerance
+                    )
                 else:
                     print(f"no data to aggregate category {cat_to_agg}")
 
         # copy HFCs and PFCs with default factors
-        if 'basket_copy' in processing_info_country:
+        if "basket_copy" in processing_info_country:
             GWPs_to_add = processing_info_country["basket_copy"]["GWPs_to_add"]
             entities = processing_info_country["basket_copy"]["entities"]
             source_GWP = processing_info_country["basket_copy"]["source_GWP"]
             for entity in entities:
-                data_source = data_country[f'{entity} ({source_GWP})']
+                data_source = data_country[f"{entity} ({source_GWP})"]
                 for GWP in GWPs_to_add:
-                    data_GWP = data_source * \
-                               GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    data_GWP = (
+                        data_source * GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    )
                     data_GWP.attrs["entity"] = entity
                     data_GWP.attrs["gwp_context"] = GWP
                     data_country[f"{entity} ({GWP})"] = data_GWP
 
         # aggregate gases if desired
-        if 'aggregate_gases' in processing_info_country:
+        if "aggregate_gases" in processing_info_country:
             # TODO: why use different code here than below. Can this fill non-existent
             #  gas baskets?
-            for case in processing_info_country['aggregate_gases'].keys():
-                case_info = processing_info_country['aggregate_gases'][case]
-                data_country[case_info['basket']] = \
-                    data_country.pr.fill_na_gas_basket_from_contents(
-                        **case_info)
+            for case in processing_info_country["aggregate_gases"].keys():
+                case_info = processing_info_country["aggregate_gases"][case]
+                data_country[
+                    case_info["basket"]
+                ] = data_country.pr.fill_na_gas_basket_from_contents(**case_info)
 
     # 3: map categories
     if category_conversion is not None:
@@ -270,61 +311,74 @@ def process_data_for_country(
     # more general processing
     # reduce categories to output cats
     if sectors_out is not None:
-        cats_to_keep = [cat for cat in
-                        data_country.coords[f'category ({cat_terminology_out})'].values
-                        if cat in sectors_out]
-        data_country = data_country.pr.loc[{'category': cats_to_keep}]
+        cats_to_keep = [
+            cat
+            for cat in data_country.coords[f"category ({cat_terminology_out})"].values
+            if cat in sectors_out
+        ]
+        data_country = data_country.pr.loc[{"category": cats_to_keep}]
 
     # create gas baskets
     entities_present = set(data_country.data_vars)
     for basket in gas_baskets.keys():
-        basket_contents_present = [gas for gas in gas_baskets[basket] if
-                                   gas in entities_present]
+        basket_contents_present = [
+            gas for gas in gas_baskets[basket] if gas in entities_present
+        ]
         if len(basket_contents_present) > 0:
             if basket in list(data_country.data_vars):
                 data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
-                    basket=basket, basket_contents=basket_contents_present,
-                    skipna=True, min_count=1)
+                    basket=basket,
+                    basket_contents=basket_contents_present,
+                    skipna=True,
+                    min_count=1,
+                )
             else:
                 try:
-                    #print(data_country.data_vars)
-                    data_country[basket] = xr.full_like(data_country["CO2"],
-                                                        np.nan).pr.quantify(
-                        units="Gg CO2 / year")
-                    data_country[basket].attrs = {"entity": basket.split(' ')[0],
-                                                  "gwp_context": basket.split(' ')[1][
-                                                                 1:-1]}
+                    # print(data_country.data_vars)
+                    data_country[basket] = xr.full_like(
+                        data_country["CO2"], np.nan
+                    ).pr.quantify(units="Gg CO2 / year")
+                    data_country[basket].attrs = {
+                        "entity": basket.split(" ")[0],
+                        "gwp_context": basket.split(" ")[1][1:-1],
+                    }
                     data_country[basket] = data_country.pr.gas_basket_contents_sum(
-                        basket=basket, basket_contents=basket_contents_present,
-                        min_count=1)
+                        basket=basket,
+                        basket_contents=basket_contents_present,
+                        min_count=1,
+                    )
                     entities_present.add(basket)
                 except Exception as ex:
-                    print(f"No gas basket created for {country_code}, {source}, "
-                          f"{scenario}: {ex}")
+                    print(
+                        f"No gas basket created for {country_code}, {source}, "
+                        f"{scenario}: {ex}"
+                    )
 
     # amend title and comment
-    data_country.attrs["comment"] = data_country.attrs["comment"] + f" Processed on " \
-                                                                    f"{date.today()}"
-    data_country.attrs["title"] = data_country.attrs["title"] + f" Processed on " \
-                                                                    f"{date.today()}"
+    data_country.attrs["comment"] = (
+        data_country.attrs["comment"] + f" Processed on " f"{date.today()}"
+    )
+    data_country.attrs["title"] = (
+        data_country.attrs["title"] + f" Processed on " f"{date.today()}"
+    )
 
     return data_country
 
 
 def convert_categories(
-        ds_input: xr.Dataset,
-        conversion: Dict[str, Dict[str, str]],
-        #terminology_from: str,
-        terminology_to: str,
-        debug: bool=False,
-        tolerance: float=0.01,
-)->xr.Dataset:
+    ds_input: xr.Dataset,
+    conversion: Dict[str, Dict[str, str]],
+    # terminology_from: str,
+    terminology_to: str,
+    debug: bool = False,
+    tolerance: float = 0.01,
+) -> xr.Dataset:
     """
     convert data from one category terminology to another
     """
     print(f"converting categories to {terminology_to}")
 
-    if 'orig_cat_name' in ds_input.coords:
+    if "orig_cat_name" in ds_input.coords:
         cat_name_present = True
     else:
         cat_name_present = False
@@ -338,50 +392,67 @@ def convert_categories(
     ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})
 
     # find categories present in dataset
-    cats_present = list(ds_converted.coords[f'category ({terminology_to})'])
+    cats_present = list(ds_converted.coords[f"category ({terminology_to})"])
 
     # restrict categories and map category names
-    if 'mapping' in conversion.keys():
-        mapping_cats_present = [cat for cat in list(conversion['mapping'].keys()) if
-                                cat in cats_present]
-        ds_converted = ds_converted.pr.loc[
-            {'category': mapping_cats_present}]
-
-        from_cats = ds_converted.coords[f'category ({terminology_to})'].values
-        to_cats = pd.Series(from_cats).replace(conversion['mapping'])
-        ds_converted = ds_converted.assign_coords({f'category ({terminology_to})':
-                                                   (f'category ({terminology_to})',
-                                                    to_cats)})
+    if "mapping" in conversion.keys():
+        mapping_cats_present = [
+            cat for cat in list(conversion["mapping"].keys()) if cat in cats_present
+        ]
+        ds_converted = ds_converted.pr.loc[{"category": mapping_cats_present}]
+
+        from_cats = ds_converted.coords[f"category ({terminology_to})"].values
+        to_cats = pd.Series(from_cats).replace(conversion["mapping"])
+        ds_converted = ds_converted.assign_coords(
+            {f"category ({terminology_to})": (f"category ({terminology_to})", to_cats)}
+        )
 
     # redo the list of present cats after mapping, as we have new categories in the
     # target terminology now
-    cats_present_mapped = list(ds_converted.coords[f'category ('
-                                                   f'{terminology_to})'].values)
+    cats_present_mapped = list(
+        ds_converted.coords[f"category ({terminology_to})"].values
+    )
     # aggregate categories
-    if 'aggregate' in conversion:
-        aggregate_cats = conversion['aggregate']
+    if "aggregate" in conversion:
+        aggregate_cats = conversion["aggregate"]
         for cat_to_agg in aggregate_cats:
             if debug:
                 print(f"Category: {cat_to_agg}")
-            source_cats = [cat for cat in aggregate_cats[cat_to_agg]['sources'] if
-                           cat in cats_present_mapped]
+            source_cats = [
+                cat
+                for cat in aggregate_cats[cat_to_agg]["sources"]
+                if cat in cats_present_mapped
+            ]
             if debug:
                 print(source_cats)
-            data_agg = ds_converted.pr.loc[{'category': source_cats}].pr.sum(
-                dim='category', skipna=True, min_count=1)
-            nan_vars = [var for var in data_agg.data_vars if
-                        data_agg[var].isnull().all().data == True]
+            data_agg = ds_converted.pr.loc[{"category": source_cats}].pr.sum(
+                dim="category", skipna=True, min_count=1
+            )
+            nan_vars = [
+                var for var in data_agg.data_vars if data_agg[var].isnull().all().data
+            ]
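+            # drop all-NaN entities so they are not merged into the dataset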
             data_agg = data_agg.drop(nan_vars)
             if len(data_agg.data_vars) > 0:
-                data_agg = data_agg.expand_dims([f'category ({terminology_to})'])
+                data_agg = data_agg.expand_dims([f"category ({terminology_to})"])
                 data_agg = data_agg.assign_coords(
-                    coords={f'category ({terminology_to})':
-                                (f'category ({terminology_to})', [cat_to_agg])})
+                    coords={
+                        f"category ({terminology_to})": (
+                            f"category ({terminology_to})",
+                            [cat_to_agg],
+                        )
+                    }
+                )
                 if cat_name_present:
                     data_agg = data_agg.assign_coords(
-                        coords={'orig_cat_name':
-                                    (f'category ({terminology_to})',
-                                     [aggregate_cats[cat_to_agg]['name']])})
+                        coords={
+                            "orig_cat_name": (
+                                f"category ({terminology_to})",
+                                [aggregate_cats[cat_to_agg]["name"]],
+                            )
+                        }
+                    )
                 ds_converted = ds_converted.pr.merge(data_agg, tolerance=tolerance)
                 cats_present_mapped.append(cat_to_agg)
             else:
@@ -391,9 +462,9 @@ def convert_categories(
 
 
 def get_country_name(
-        country_code: str,
+    country_code: str,
 ) -> str:
-    """get country name from code """
+    """get country name from code"""
     if country_code in custom_country_mapping:
         country_name = custom_country_mapping[country_code]
     else:
@@ -401,15 +472,16 @@ def get_country_name(
             country = pycountry.countries.get(alpha_3=country_code)
             country_name = country.name
         except:
-            raise ValueError(f"Country code {country_code} can not be mapped to "
-                             f"any country")
+            raise ValueError(
+                f"Country code {country_code} cannot be mapped to any country"
+            )
 
     return country_name
 
 
 def get_country_code(
-        country_name: str,
-)->str:
+    country_name: str,
+) -> str:
     """
     obtain country code. If the input is a code it will be returned,
     if the input
@@ -435,28 +507,31 @@ def get_country_code(
             country_code = country.alpha_3
         except:
             try:
-                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
+                country = pycountry.countries.search_fuzzy(
+                    country_name.replace("_", " ")
+                )
             except:
-                raise ValueError(f"Country name {country_name} can not be mapped to "
-                                 f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
+                raise ValueError(
+                    f"Country name {country_name} cannot be mapped to "
+                    f"any country code. Try using the ISO3 code directly."
+                )
             if len(country) > 1:
                 country_code = None
                 for current_country in country:
                     if current_country.name == country_name:
                         country_code = current_country.alpha_3
                 if country_code is None:
-                    raise ValueError(f"Country name {country_name} has {len(country)} "
-                                     f"possible results for country codes.")
+                    raise ValueError(
+                        f"Country name {country_name} has {len(country)} "
+                        f"possible results for country codes."
+                    )
 
-            country_code = country[0].alpha_3
+            else:
+                country_code = country[0].alpha_3
 
     return country_code
 
 
-def create_folder_mapping(
-        folder: str,
-        extracted: bool = False
-) -> None:
+def create_folder_mapping(folder: str, extracted: bool = False) -> None:
     """
     Create a mapping from 3 letter ISO country codes to folders
     based on the subfolders of the given folder. The mapping is
@@ -480,9 +555,9 @@ def create_folder_mapping(
 
     folder = root_path / folder
     folder_mapping = {}
-    #if not extracted:
+    # if not extracted:
     known_folders = custom_folders
-    #else:
+    # else:
     #    known_folders = {}
 
     for item in folder.iterdir():
@@ -491,7 +566,9 @@ def create_folder_mapping(
                 ISO3 = known_folders[item.name]
             else:
                 try:
-                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
+                    country = pycountry.countries.search_fuzzy(
+                        item.name.replace("_", " ")
+                    )
                     if len(country) > 1:
                         ISO3 = None
                         for current_country in country:
@@ -516,8 +593,8 @@ def create_folder_mapping(
 
 # TODO add crf
 def get_country_submissions(
-        country_name: str,
-        print_sub: bool = True,
+    country_name: str,
+    print_sub: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO code for a country, or the country's name.
@@ -585,8 +662,8 @@ def get_country_submissions(
 
 
 def get_country_datasets(
-        country_name: str,
-        print_ds: bool = True,
+    country_name: str,
+    print_ds: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO code for a country, or the country's name.
@@ -638,35 +715,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
 
                 datasets_current_folder = {}
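+                # map each dataset file stem to the list of suffixes present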
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] =\
-                            dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong code: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology})"
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
 
                         code_file = get_code_file(country_code, parts[1])
@@ -680,7 +764,9 @@ def get_country_datasets(
                 if print_ds:
                     if cleaned_datasets_current_folder:
                         for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                            print(
+                                f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                            )
                     else:
                         print("No data available")
                     print("")
@@ -708,34 +794,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
 
                 datasets_current_folder = {}
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong code: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
 
                         cleaned_datasets_current_folder[key] = data_info
@@ -743,7 +837,9 @@ def get_country_datasets(
                 if print_ds:
                     if cleaned_datasets_current_folder:
                         for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                            print(
+                                f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                            )
                     else:
                         print("No data available")
                     print("")
@@ -759,9 +855,9 @@ def get_country_datasets(
 
 
 def get_code_file(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
+    country_name: str,
+    submission: str,
+    print_info: bool = False,
 ) -> Path:
     """
     For a given country name and submission, find the script that creates the data
@@ -813,13 +909,17 @@ def get_code_file(
         for file in country_folder.iterdir():
             if file.match(code_file_name_candidate):
                 if code_file_path is not None:
-                    raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
-                                     f"{code_file_path} and file.name. "
-                                     f"Please use only one file with name "
-                                     f"'read_ISO3_submission_XXX.YYY'.")
+                    raise ValueError(
+                        f"Found multiple code file candidates: "
+                        f"{code_file_path} and {file.name}. "
+                        f"Please use only one file with name "
+                        f"'read_ISO3_submission_XXX.YYY'."
+                    )
                 else:
                     if print_info:
-                        print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
+                        print(f"Found code file {file.relative_to(root_path)}")
                 code_file_path = file
 
     if code_file_path is not None:
@@ -828,8 +928,10 @@ def get_code_file(
         return None
 
 
-def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int)->pd.DataFrame:
-    '''
+def fix_rows(
+    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
+) -> pd.DataFrame:
+    """
     Function to fix rows that have been split during reading from a pdf.
     This is the version used for Malaysia BUR3 and BUR4; adapt for other
     BURs if needed.
 
@@ -838,18 +940,20 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
     :param col_to_use: column in which to look for the rows_to_fix values
     :param n_rows: number of rows to merge; negative values mean the merge
         starts one row above the matched row
     :return: DataFrame with the split rows merged
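+
+    Example: n_rows=2 joins the matched row with the row below it;
+    n_rows=-3 joins the row above, the matched row, and the row below.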
-    '''
+    """
     for row in rows_to_fix:
-        #print(row)
+        # print(row)
         # find the row number and collect the rows to merge
         index = data.loc[data[col_to_use] == row].index
-        #print(list(index))
+        # print(list(index))
         if not list(index):
             print(f"Can't merge split row {row}")
             print(data[col_to_use])
+            continue
-        #print(f"Merging split row {row} for table {page}")
+        # print(f"Merging split row {row} for table {page}")
         loc = data.index.get_loc(index[0])
-        if n_rows == -3:
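+        # negative n_rows: the merge starts one row above the matched row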
+        if n_rows == -2:
+            locs_to_merge = list(range(loc - 1, loc + 1))
+        elif n_rows == -3:
             locs_to_merge = list(range(loc - 1, loc + 2))
         elif n_rows == -5:
             locs_to_merge = list(range(loc - 1, loc + 4))
@@ -858,7 +962,7 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         rows_to_merge = data.iloc[locs_to_merge]
         indices_to_merge = rows_to_merge.index
         # join the three rows
-        new_row = rows_to_merge.agg(' '.join)
+        new_row = rows_to_merge.agg(" ".join)
         # replace the double spaces that are created
         # must be done here and not at the end as splits are not always
         # the same and join would produce different col values
@@ -866,6 +970,10 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         new_row = new_row.str.replace("N O", "NO")
         new_row = new_row.str.replace(", N", ",N")
         new_row = new_row.str.replace("- ", "-")
+        # replace spaces in numbers
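+        # e.g. "123 456,78" -> "123456,78" (one space between digit groups)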
+        pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
+        repl = lambda m: f"{m.group('first')}{m.group('last')}"
+        new_row = new_row.str.replace(pat, repl, regex=True)
         data.loc[indices_to_merge[0]] = new_row
         data = data.drop(indices_to_merge[1:])
-    return data
+    return data

+ 3 - 0
pyproject.toml

@@ -6,3 +6,6 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
 
+[tool.black]
+line-length = 88
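+# 88 is black's default line length; set explicitly so editors and CI agree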
+

+ 2 - 1
setup.cfg

@@ -30,7 +30,7 @@ packages =
     UNFCCC_GHG_data.UNFCCC_downloader
     UNFCCC_GHG_data.UNFCCC_DI_reader
     UNFCCC_GHG_data.helper
-    #UNFCCC_GHG_data.datasets
+#UNFCCC_GHG_data.datasets
 python_requires = >=3.8
 setup_requires =
     setuptools_scm
@@ -70,6 +70,7 @@ dev =
     jupyter
     dask
     ipympl
+    black
 
 
 [options.package_data]