Add Peru BUR3 code and some modifications to functions

Johannes Gütschow committed 1 year ago
commit 61639b9f1e

+ 560 - 0
UNFCCC_GHG_data/UNFCCC_reader/Peru/config_PER_BUR3.py

@@ -0,0 +1,560 @@
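+# Configuration for reading the GHG inventory tables from Peru's third
+# Biennial Update Report (BUR3) PDF with camelot. The templates below give,
+# for each PDF page, the table area, the column separators, and the rows
+# that need re-joining after extraction.
+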
+table_def_templates = {
+    "300": {  # 300
+        "area": ["69,457,727,78"],
+        "cols": ["288,352,391,426,458,485,519,552,587,615,643"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de uso",
+                "Uso de productos sustitutos de las sustancias que",
+            ],
+            2: [
+                "1A Actividades de quema de combustible",
+                "2A Industria de los minerales",
+                "2B Industria química",
+                "2C Industria de los metales",
+                "2E Industria electrónica",
+                "3A Ganado",
+                "3A1 Fermentación entérica",
+            ],
+        },
+    },
+    "301": {  # 301
+        "area": ["72,542,727,99"],
+        "cols": ["288,352,391,426,458,485,519,552,587,615,643"],
+        "rows_to_fix": {
+            3: [
+                "Fuentes agregadas y fuentes de emisión no CO2 de",
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            2: [
+                "3A2 Manejo del estiércol",
+                "3C1 Emisiones por quema de biomasa",
+                "3C3 Aplicación de urea",
+                "3C7 Cultivo de arroz",
+                "A Disposición de residuos sólidos",
+                "B Tratamiento biológico de residuos",
+                "C Incineración de residuos",
+                "D Tratamiento y descarga de aguas residuales",
+                "Búnker internacional",
+            ],
+        },
+    },
+    "302": {  # 302
+        "area": ["72,510,727,79"],
+        "cols": ["278,335,376,415,453,482,512,548,585,623,656"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emision no CO2",
+            ],
+            -3: ["Total de las emisiones y remociones nacionales"],
+        },
+    },
+    "303": {  # 303
+        "area": ["72,540,727,127"],
+        "cols": ["278,335,376,415,453,482,512,548,585,623,656"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            2: ["Aviación internacional"],
+        },
+    },
+    "304": {  # 304
+        "area": ["72,510,727,70"],
+        "cols": ["275,332,365,408,441,470,499,533,577,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "305": {  # 305
+        "area": ["72,540,727,108"],
+        "cols": ["275,332,365,408,441,470,499,533,577,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+    "306": {  # 306
+        "area": ["72,510,727,70"],
+        "cols": ["266,320,364,405,440,468,499,536,576,620,656"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la",
+                "Productos no energéticos de combustibles y",
+                "Uso de productos sustitutos de las sustancias",
+                "Fuentes agregadas y fuentes de emisión no",
+            ],
+        },
+    },
+    "307": {  # 307
+        "area": ["72,540,727,108"],
+        "cols": ["266,320,364,405,440,468,499,536,576,620,656"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA",
+            ],
+        },
+    },
+    "308": {  # 308
+        "area": ["72,510,727,70"],
+        "cols": ["278,329,372,406,441,470,500,536,579,621,653"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "309": {  # 309
+        "area": ["72,540,727,117"],
+        "cols": ["278,329,372,406,441,470,500,536,579,621,653"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+    "310": {  # 310
+        "area": ["72,510,727,70"],
+        "cols": ["279,334,379,418,453,480,505,541,582,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Industrias manufactureras y de la",
+                "Emisiones fugitivas provenientes de la fabricación",
+                "Productos no energéticos de combustibles y de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2",
+            ],
+        },
+    },
+    "311": {  # 311
+        "area": ["72,540,727,110"],
+        "cols": ["279,334,379,418,453,480,505,541,582,620,654"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones directas de N2O en suelos",
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+            -2: ["Emisiones de CO2 de la biomasa"],
+        },
+    },
+    "312": {  # 312
+        "area": ["72,510,727,70"],
+        "cols": ["297,349,393,426,461,489,514,547,592,629,657"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones fugitivas provenientes de la fabricación de",
+                "Productos no energéticos de combustibles y de uso de",
+                "Uso de productos sustitutos de las sustancias que",
+                "Fuentes agregadas y fuentes de emisión no CO2 de la",
+            ],
+        },
+    },
+    "313": {  # 313
+        "area": ["72,540,727,90"],
+        "cols": ["297,349,393,426,461,489,514,547,592,629,657"],
+        "rows_to_fix": {
+            3: [
+                "Emisiones indirectas de N2O en suelos",
+                "Emisiones indirectas de N2O por manejo del",
+                "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y",
+            ],
+        },
+    },
+}
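+
+# Template fields (as used by the reader script):
+# - "area": bounding box of the table on the page ("x1,y1,x2,y2" in PDF
+#   points), passed to camelot.read_pdf as table_areas
+# - "cols": x-coordinates of the column separators, passed as columns
+# - "rows_to_fix": category names that camelot split over several rows; the
+#   key is the number of rows fix_rows joins (a negative key presumably
+#   joins in the opposite direction, with the matched row last)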
+
+header = {
+    "entity": [
+        "Categorías de emisiones y sumideros de GEI",
+        "Emisiones/remociones netas de CO2",
+        "CH4",
+        "N2O",
+        "HFC",
+        "PFC",
+        "SF6",
+        "CO",
+        "NOx",
+        "COVDM",
+        "SOX",
+        "Emisiones/remociones totales de GEI",
+    ],
+    "unit": [
+        "",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+        "GgCO2eq",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "Gg",
+        "GgCO2eq",
+    ],
+}
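+
+# The reader zips the two lists above into an (entity, unit) column
+# MultiIndex; the F-gas baskets and the GHG total are reported in Gg CO2eq,
+# the individual gases in Gg.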
+
+table_defs = {
+    "300": {
+        "templates": ["300"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4, 5],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2000,
+        "coords_value_mapping": "default",
+    },
+    "301": {
+        "templates": ["301"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4, 5],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2000,
+        "coords_value_mapping": "default",
+    },
+    "302": {
+        "templates": ["302"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2005,
+        "coords_value_mapping": "default",
+    },
+    "303": {
+        "templates": ["303"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2005,
+        "coords_value_mapping": "default",
+    },
+    "304": {
+        "templates": ["304"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2010,
+        "coords_value_mapping": "default",
+    },
+    "305": {
+        "templates": ["305"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2010,
+        "coords_value_mapping": "default",
+    },
+    "306": {
+        "templates": ["306"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2012,
+        "coords_value_mapping": "default",
+    },
+    "307": {
+        "templates": ["307"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2012,
+        "coords_value_mapping": "default",
+    },
+    "308": {
+        "templates": ["308"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2014,
+        "coords_value_mapping": "default",
+    },
+    "309": {
+        "templates": ["309"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2014,
+        "coords_value_mapping": "default",
+    },
+    "310": {
+        "templates": ["310"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2016,
+        "coords_value_mapping": "default",
+    },
+    "311": {
+        "templates": ["311"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2016,
+        "coords_value_mapping": "default",
+    },
+    "312": {
+        "templates": ["312"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2019,
+        "coords_value_mapping": "default",
+    },
+    "313": {
+        "templates": ["313"],
+        "header": header,
+        "drop_rows": [0, 1, 2, 3, 4],
+        "category_col": "Categorías de emisiones y sumideros de GEI",
+        "year": 2019,
+        "coords_value_mapping": "default",
+    },
+}
+
+cat_names_fix = {
+    "Industrias manufactureras y de la 1A2 construcción":
+        "1A2 Industrias manufactureras y de la construcción",
+    "Emisiones fugitivas provenientes de la fabricación 1B de combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Emisiones fugitivas provenientes de la 1B fabricación de combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Emisiones fugitivas provenientes de la fabricación de 1B combustibles":
+        "1B Emisiones fugitivas provenientes de la fabricación de combustibles",
+    "Productos no energéticos de combustibles y de uso 2D de solventes":
+        "2D Productos no energéticos de combustibles y de uso de solventes",
+    "Productos no energéticos de combustibles y de 2D uso de solventes":
+        "2D Productos no energéticos de combustibles y de uso de solventes",
+    "Uso de productos sustitutos de las sustancias que 2F agotan la capa de ozono":
+        "2F Uso de productos sustitutos de las sustancias que agotan la capa de ozono",
+    "Uso de productos sustitutos de las sustancias 2F que agotan la capa de ozono":
+        "2F Uso de productos sustitutos de las sustancias que agotan la capa de ozono",
+    "Fuentes agregadas y fuentes de emisión no CO2 de 3C la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emision no CO2 3C de la tierra":
+        "3C Fuentes agregadas y fuentes de emision no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no CO2 3C de la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no 3C CO2 de la tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Fuentes agregadas y fuentes de emisión no CO2 de la 3C tierra":
+        "3C Fuentes agregadas y fuentes de emisión no CO2 de la tierra",
+    "Emisiones directas de N2O en suelos 3C4 gestionados":
+        "3C4 Emisiones directas de N2O en suelos gestionados",
+    "Emisiones indirectas de N2O en suelos 3C5 gestionados":
+        "3C5 Emisiones indirectas de N2O en suelos gestionados",
+    "Emisiones indirectas de N2O por manejo del 3C6 estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "Emisiones indirectas de N2O por manejo 3C6 del estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "Emisiones indirectas de N2O por 3C6 manejo del estiércol":
+        "3C6 Emisiones indirectas de N2O por manejo del estiércol",
+    "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y 4 SILVICULTURA":
+        "4 USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y SILVICULTURA",
+    "USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA 4 Y SILVICULTURA":
+        "4 USO DE LA TIERRA, CAMBIO DE USO DE LA TIERRA Y SILVICULTURA",
+}
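+
+# For wrapped category names camelot merges the code into the middle of the
+# name (e.g. "Industrias manufactureras y de la 1A2 construcción"). The
+# mapping above restores the "code first" form so that cat_code_regexp
+# below can extract the code.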
+
+values_replacement = {
+    #    '': '-',
+    " ": "",
+}
+
+gwp_to_use = "AR5GWP100"
+
+index_cols = ["orig_cat_name"]
+cols_for_space_stripping = index_cols
+
+unit_row = "header"
+
+## parameters part 2: conversion to PRIMAP2 interchange format
+
+cats_remove = ["Partidas informativas"]
+
+cat_codes_manual = {
+    "Emisiones de CO2 de la biomasa": "M.BIO",
+    "Total de las emisiones y remociones nacionales": "0",
+    "Búnker internacional": "M.BK",
+    "Aviación internacional": "M.BK.A",
+    "Transporte marítimo y fluvial internacional": "M.BK.M",
+    "A Disposición de residuos sólidos": "5.A",
+    "B Tratamiento biológico de residuos": "5.B",
+    "C Incineración de residuos": "5.C",
+    "D Tratamiento y descarga de aguas residuales": "5.D",
+    "Tierras": "M.2006.3.B",
+}
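+
+# M.BIO and the M.BK codes are PRIMAP codes for the memo items (CO2 from
+# biomass, international bunkers); "Tierras" (land) is parked under the
+# custom code M.2006.3.B and mapped to 3.B during category conversion below.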
+
+
+cat_code_regexp = r"(?P<code>^[A-Za-z0-9]{1,7})\s.*"
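+# example: "1A2 Industrias manufactureras y de la construcción" yields the
+# code "1A2", which convert_ipcc_code_primap_to_primap2 turns into "1.A.2"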
+
+# special header as category code and name in one column
+header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
+
+coords_terminologies = {
+    "area": "ISO3",
+    "category": "IPCC1996_2006_PER_INV",
+    "scenario": "PRIMAP",
+}
+
+coords_terminologies_2006 = {
+    "area": "ISO3",
+    "category": "IPCC2006_PRIMAP",
+    "scenario": "PRIMAP",
+}
+
+coords_defaults = {
+    "source": "PER-GHG-inventory ",
+    "provenance": "measured",
+    "area": "PER",
+    "scenario": "BUR3",
+}
+
+coords_value_mapping = {
+    "default": {
+        "unit": "PRIMAP1",
+        "entity": {
+            "Emisiones/remociones netas de CO2": "CO2",
+            "CH4": "CH4",
+            "N2O": "N2O",
+            "HFC": f"HFCS ({gwp_to_use})",
+            "PFC": f"PFCS ({gwp_to_use})",
+            "SF6": "SF6",
+            "CO": "CO",
+            "NOx": "NOX",
+            "COVDM": "NMVOC",
+            "SOx": "SOX",
+            "Emisiones/remociones totales de GEI": f"KYOTOGHG ({gwp_to_use})",
+        },
+    },
+}
+
+coords_cols = {"category": "category", "entity": "entity", "unit": "unit"}
+
+add_coords_cols = {
+    "orig_cat_name": ["orig_cat_name", "category"],
+}
+
+filter_remove = {
+    # "f1" :{
+    #     "entity": ["HFC-125", "HFC-134a", "HFC-143a", "HFC-152a", "HFC-227ea",
+    #                "HFC-23", "HFC-32", "HFC-41", "HFC-43-10mee", "PFC-116",
+    #                "PFC-14", "PFC-218", "PFC-318", "NF3", "SF6"],
+    #     "category": "2"
+    # }
+}
+
+meta_data = {
+    "references": "https://unfccc.int/documents/",
+    "rights": "",
+    "contact": "mail@johannes-guetschow.de",
+    "title": "",
+    "comment": "Read fom pdf file by Johannes Gütschow",
+    "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
+}
+
+
+## processing
+cat_conversion = {
+    'mapping': {
+        '0': '0',
+        '1': '1',
+        '1.A': '1.A',
+        '1.A.1': '1.A.1',
+        '1.A.2': '1.A.2',
+        '1.A.3': '1.A.3',
+        '1.A.4': '1.A.4',
+        '1.A.5': '1.A.5',
+        '1.B': '1.B',
+        '1.B.1': '1.B.1',
+        '1.B.2': '1.B.2',
+        '2': '2',
+        '2.A': '2.A',
+        '2.B': '2.B',
+        '2.C': '2.C',
+        '2.D': '2.D',
+        '2.E': '2.E',
+        '2.F': '2.F',
+        '2.G': '2.G',
+        '2.H': '2.H',
+        '3': 'M.AG',
+        '3.A': '3.A',
+        '3.A.1': '3.A.1',
+        '3.A.2': '3.A.2',
+        '3.C': '3.C',
+        '3.C.1': '3.C.1',
+        '3.C.2': '3.C.2',
+        '3.C.3': '3.C.3',
+        '3.C.4': '3.C.4',
+        '3.C.5': '3.C.5',
+        '3.C.6': '3.C.6',
+        '3.C.7': '3.C.7',
+        '4': 'M.LULUCF',
+        'M.2006.3.B': '3.B',
+        '4.A': '3.B.1',
+        '4.B': '3.B.2',
+        '4.C': '3.B.3',
+        '4.D': '3.B.4',
+        '4.E': '3.B.5',
+        '4.F': '3.B.6',
+        '4.G': '3.D.1',
+        '5': '4',
+        '5.A': '4.A',
+        '5.B': '4.B',
+        '5.C': '4.C',
+        '5.D': '4.D',
+        'M.BK': 'M.BK',
+        'M.BK.A': 'M.BK.A',
+        'M.BK.M': 'M.BK.M',
+        'M.BIO': 'M.BIO',
+    },
+    'aggregate': {
+        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
+              'name': 'IPPU'},
+        'M.3.C.AG': {
+            'sources': ['3.C'],
+            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
+        'M.AG.ELV': {'sources': ['M.3.C.AG'],
+                     'name': 'Agriculture excluding livestock emissions'},
+        '3.D': {'sources': ['3.D.1'], 'name': 'Other'},
+        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    },
+}
+
+processing_info = {
+    'basket_copy': {
+        'GWPs_to_add': ["SARGWP100", "AR4GWP100", "AR6GWP100"],
+        'entities': ["HFCS", "PFCS"],
+        'source_GWP': gwp_to_use,
+    },
+}
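+
+# basket_copy: HFC and PFC totals are only reported in AR5 GWP100 terms, so
+# processing copies the HFCS/PFCS baskets to the other GWP contexts using
+# the constant basket-level factors in GWP_factors (an approximation, since
+# the gas-level split is not reported).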

+ 290 - 0
UNFCCC_GHG_data/UNFCCC_reader/Peru/read_PER_BUR3_from_pdf.py

@@ -0,0 +1,290 @@
+# read Peru's third BUR from pdf
+
+
+import camelot
+import primap2 as pm2
+import pandas as pd
+
+import locale
+
+from UNFCCC_GHG_data.helper import process_data_for_country, gas_baskets
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper import fix_rows
+from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
+from config_PER_BUR3 import table_def_templates, table_defs, index_cols
+from config_PER_BUR3 import values_replacement, header_long, cats_remove
+from config_PER_BUR3 import cat_codes_manual, cat_code_regexp, cat_names_fix
+from config_PER_BUR3 import coords_cols, coords_terminologies, coords_defaults
+from config_PER_BUR3 import coords_terminologies_2006
+from config_PER_BUR3 import coords_value_mapping, meta_data, filter_remove
+from config_PER_BUR3 import processing_info, cat_conversion
+
+### general configuration
+input_folder = downloaded_data_path / "UNFCCC" / "Peru" / "BUR3"
+output_folder = extracted_data_path / "UNFCCC" / "Peru"
+if not output_folder.exists():
+    output_folder.mkdir()
+
+output_filename = "PER_BUR3_2023_"
+inventory_file_pdf = "Tercer_BUR_Per%C3%BA_Jun2023.pdf"
+# years_to_read = range(1990, 2018 + 1)
+
+# define locale to use for str to float conversion
+locale_to_use = "es_PE.UTF-8"
+locale.setlocale(locale.LC_NUMERIC, locale_to_use)
+
+pagesToRead = table_defs.keys()
+
+compression = dict(zlib=True, complevel=9)
+
+## part 1: read the data from pdf
+### part 1.a: read the inventory tables (years 2000 to 2019)
+
+data_pm2 = None
+for page in pagesToRead:
+    print(f"++++++++++++++++++++++++++++++++")
+    print(f"+++++ Working on page {page} ++++++")
+    print(f"++++++++++++++++++++++++++++++++")
+
+    df_this_page = None
+    for table_on_page in table_defs[page]["templates"]:
+        print(f"Reading table {table_on_page}")
+        area = table_def_templates[table_on_page]["area"]
+        cols = table_def_templates[table_on_page]["cols"]
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file_pdf),
+            pages=str(page),
+            flavor="stream",
+            table_areas=area,
+            columns=cols,
+        )
+
+        df_current = tables[0].df.copy(deep=True)
+        # drop the old header
+        if "drop_rows" in table_defs[page].keys():
+            df_current = df_current.drop(table_defs[page]["drop_rows"])
+        elif "drop_rows" in table_def_templates[table_on_page].keys():
+            df_current = df_current.drop(
+                table_def_templates[table_on_page]["drop_rows"]
+            )
+        # add new header
+        if "header" in table_defs[page].keys():
+            df_current.columns = pd.MultiIndex.from_tuples(
+                zip(
+                    table_defs[page]["header"]["entity"],
+                    table_defs[page]["header"]["unit"],
+                )
+            )
+        else:
+            df_current.columns = pd.MultiIndex.from_tuples(
+                zip(
+                    table_def_templates[table_on_page]["header"]["entity"],
+                    table_def_templates[table_on_page]["header"]["unit"],
+                )
+            )
+
+        # drop cols if necessary
+        if "drop_cols" in table_defs[page].keys():
+            # print(df_current.columns.values)
+            df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
+        elif "drop_cols" in table_def_templates[table_on_page].keys():
+            df_current = df_current.drop(
+                columns=table_def_templates[table_on_page]["drop_cols"]
+            )
+
+        # rename category column
+        df_current.rename(
+            columns={table_defs[page]["category_col"]: index_cols[0]}, inplace=True
+        )
+
+        # replace line breaks within the category names by spaces
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
+        # replace double and triple spaces
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")
+
+        # fix the split rows
+        for n_rows in table_def_templates[table_on_page]["rows_to_fix"].keys():
+            df_current = fix_rows(
+                df_current,
+                table_def_templates[table_on_page]["rows_to_fix"][n_rows],
+                index_cols[0],
+                n_rows,
+            )
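+        # e.g. "Industrias manufactureras y de la" and "1A2 construcción"
+        # are joined into one row here; cat_names_fix below then moves the
+        # code to the front of the name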
+
+        # replace category names with typos
+        df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)
+
+        # replace cells that contain only a space with empty strings
+        df_current = df_current.replace(values_replacement)
+
+        # set index
+        # df_current = df_current.set_index(index_cols)
+        # strip leading and trailing spaces and remove "^"
+        for col in df_current.columns.values:
+            df_current[col] = df_current[col].str.strip()
+            df_current[col] = df_current[col].str.replace("^", "")
+
+        # print(df_current)
+        # aggregate dfs for this page
+        if df_this_page is None:
+            df_this_page = df_current.copy(deep=True)
+        else:
+            # find intersecting cols
+            cols_this_page = df_this_page.columns.values
+            # print(f"cols this page: {cols_this_page}")
+            cols_current = df_current.columns.values
+            # print(f"cols current: {cols_current}")
+            cols_both = list(set(cols_this_page).intersection(set(cols_current)))
+            # print(f"cols both: {cols_both}")
+            if len(cols_both) > 0:
+                df_this_page = df_this_page.merge(
+                    df_current, how="outer", on=cols_both, suffixes=(None, None)
+                )
+            else:
+                df_this_page = df_this_page.merge(
+                    df_current,
+                    how="outer",
+                    left_index=True,
+                    right_index=True,
+                    suffixes=(None, None),
+                )
+
+            df_this_page = df_this_page.groupby(index_cols).first().reset_index()
+            # print(df_this_page)
+            # df_all = df_all.join(df_current, how='outer')
+
+    # set index and convert to long format
+    df_this_page = df_this_page.set_index(index_cols)
+    df_this_page_long = pm2.pm2io.nir_convert_df_to_long(
+        df_this_page, table_defs[page]["year"], header_long
+    )
+
+    # drop the rows with memo items etc
+    for cat in cats_remove:
+        df_this_page_long = df_this_page_long.drop(
+            df_this_page_long.loc[df_this_page_long.loc[:, index_cols[0]] == cat].index
+        )
+
+    # make a copy of the categories row
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, index_cols[0]]
+
+    # replace cat names by codes in col "Categories"
+    # first the manual replacements
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, "category"].replace(
+        cat_codes_manual
+    )
+    # then the regex replacements
+    repl = lambda m: convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+    df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+        :, "category"
+    ].str.replace(cat_code_regexp, repl, regex=True)
+    df_this_page_long.loc[:, "category"].unique()
+
+    # strip spaces in data col
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.strip()
+
+    df_this_page_long = df_this_page_long.reset_index(drop=True)
+
+    # make sure all col headers are str
+    df_this_page_long.columns = df_this_page_long.columns.map(str)
+
+    # remove "." thousands separators as pd.to_numeric can't deal with them
+    # (regex=False: treat the dot literally)
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+        ".", "", regex=False
+    )
+    pat = r"^(?P<first>[0-9\.,]*),(?P<last>[0-9\.,]*)$"
+    repl = lambda m: f"{m.group('first')}.{m.group('last')}"
+    df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+        pat, repl, regex=True
+    )
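+    # example: "12.345,67" becomes "12345,67" after the separator removal
+    # above and "12345.67" after the decimal-comma regex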
+
+    # df_this_page_long["data"] = df_this_page_long["data"].str.replace("^.$","",
+    #                                                                   regex=True)
+
+    # drop orig cat name as it's not unique over all tables (keep until here in case
+    # it's needed for debugging)
+    df_this_page_long = df_this_page_long.drop(columns="orig_cat_name")
+
+    data_page_if = pm2.pm2io.convert_long_dataframe_if(
+        df_this_page_long,
+        coords_cols=coords_cols,
+        # add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping[
+            table_defs[page]["coords_value_mapping"]
+        ],
+        # coords_value_filling=coords_value_filling,
+        filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
+
+    # conversion to PRIMAP2 native format
+    data_page_pm2 = pm2.pm2io.from_interchange_format(data_page_if)
+
+    # combine with tables from other pages
+    if data_pm2 is None:
+        data_pm2 = data_page_pm2
+    else:
+        data_pm2 = data_pm2.pr.merge(data_page_pm2)
+
+# convert back to IF to have units in the fixed format
+data_if = data_pm2.pr.to_interchange_format()
+
+# ###
+# save data to IF and native format
+# ###
+if not output_folder.exists():
+    output_folder.mkdir()
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+    data_if,
+)
+
+encoding = {var: compression for var in data_pm2.data_vars}
+data_pm2.pr.to_netcdf(
+    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+    encoding=encoding,
+)
+
+
+# ###
+# ## process the data
+# ###
+data_proc_pm2 = data_pm2
+
+# actual processing
+
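+# process_data_for_country converts the categories to IPCC2006_PRIMAP via
+# cat_conversion (mapping plus aggregation, e.g. M.AG + M.LULUCF -> 3),
+# copies the HFCS/PFCS baskets to additional GWP contexts (basket_copy),
+# and fills gas baskets such as KYOTOGHG from the individual gases.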
+data_proc_pm2 = process_data_for_country(
+    data_proc_pm2,
+    entities_to_ignore=[],
+    gas_baskets=gas_baskets,
+    processing_info_country=processing_info,
+    cat_terminology_out=coords_terminologies_2006["category"],
+    category_conversion=cat_conversion,
+)
+
+# adapt source and metadata
+current_source = data_proc_pm2.coords["source"].values[0]
+data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
+data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
+
+# ###
+# save data to IF and native format
+# ###
+data_proc_if = data_proc_pm2.pr.to_interchange_format()
+if not output_folder.exists():
+    output_folder.mkdir()
+pm2.pm2io.write_interchange_format(
+    output_folder / (output_filename + coords_terminologies_2006["category"]),
+    data_proc_if,
+)
+
+encoding = {var: compression for var in data_proc_pm2.data_vars}
+data_proc_pm2.pr.to_netcdf(
+    output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+    encoding=encoding,
+)

+ 340 - 232
UNFCCC_GHG_data/helper/functions.py

@@ -15,76 +15,84 @@ from .definitions import root_path, downloaded_data_path, extracted_data_path
 from .definitions import legacy_data_path, code_path
 from .definitions import legacy_data_path, code_path
 from .definitions import GWP_factors
 from .definitions import GWP_factors
 
 
+
 def process_data_for_country(
 def process_data_for_country(
-        data_country: xr.Dataset,
-        entities_to_ignore: List[str],
-        gas_baskets: Dict[str, List[str]],
-        filter_dims: Optional[Dict[str, List[str]]] = None,
-        cat_terminology_out: Optional[str] = None,
-        category_conversion: Dict[str, Dict] = None,
-        sectors_out: List[str] = None,
-        processing_info_country: Dict = None,
+    data_country: xr.Dataset,
+    entities_to_ignore: List[str],
+    gas_baskets: Dict[str, List[str]],
+    filter_dims: Optional[Dict[str, List[str]]] = None,
+    cat_terminology_out: Optional[str] = None,
+    category_conversion: Dict[str, Dict] = None,
+    sectors_out: List[str] = None,
+    processing_info_country: Dict = None,
 ) -> xr.Dataset:
 ) -> xr.Dataset:
     """
     """
-        Process data from DI interface (where necessary).
-        * Downscaling including subtraction of time series
-        * country specific sector aggregation
-        * Conversion to IPCC2006 categories
-        * general sector and gas basket aggregation (in new categories)
+    Process data from DI interface (where necessary).
+    * Downscaling including subtraction of time series
+    * country specific sector aggregation
+    * Conversion to IPCC2006 categories
+    * general sector and gas basket aggregation (in new categories)
     """
     """
 
 
     # 0: gather information
     # 0: gather information
-    countries = list(data_country.coords[data_country.attrs['area']].values)
+    countries = list(data_country.coords[data_country.attrs["area"]].values)
     if len(countries) > 1:
     if len(countries) > 1:
         raise ValueError(
         raise ValueError(
             f"Found {len(countries)} countries. Only single country data "
             f"Found {len(countries)} countries. Only single country data "
-            f"can be processed by this function. countries: {countries}")
+            f"can be processed by this function. countries: {countries}"
+        )
     else:
     else:
         country_code = countries[0]
         country_code = countries[0]
 
 
     # get category terminology
     # get category terminology
-    cat_col = data_country.attrs['cat']
-    temp = re.findall(r'\((.*)\)', cat_col)
+    cat_col = data_country.attrs["cat"]
+    temp = re.findall(r"\((.*)\)", cat_col)
     cat_terminology_in = temp[0]
     cat_terminology_in = temp[0]
 
 
     # get scenario
     # get scenario
-    scenarios = list(data_country.coords[data_country.attrs['scen']].values)
+    scenarios = list(data_country.coords[data_country.attrs["scen"]].values)
     if len(scenarios) > 1:
     if len(scenarios) > 1:
         raise ValueError(
         raise ValueError(
             f"Found {len(scenarios)} scenarios. Only single scenario data "
             f"Found {len(scenarios)} scenarios. Only single scenario data "
-            f"can be processed by this function. Scenarios: {scenarios}")
+            f"can be processed by this function. Scenarios: {scenarios}"
+        )
     scenario = scenarios[0]
     scenario = scenarios[0]
 
 
     # get source
     # get source
-    sources = list(data_country.coords['source'].values)
+    sources = list(data_country.coords["source"].values)
     if len(sources) > 1:
     if len(sources) > 1:
         raise ValueError(
         raise ValueError(
             f"Found {len(sources)} sources. Only single source data "
             f"Found {len(sources)} sources. Only single source data "
-            f"can be processed by this function. Sources: {sources}")
+            f"can be processed by this function. Sources: {sources}"
+        )
     source = sources[0]
     source = sources[0]
 
 
     # check if category name column present
     # check if category name column present
     # TODO: replace 'name' in config by  'additional_cols' dict that defines the cols
     # TODO: replace 'name' in config by  'additional_cols' dict that defines the cols
     #  and the values
     #  and the values
-    if 'orig_cat_name' in data_country.coords:
+    if "orig_cat_name" in data_country.coords:
         cat_name_present = True
         cat_name_present = True
     else:
     else:
         cat_name_present = False
         cat_name_present = False
 
 
     # 1: general processing
     # 1: general processing
     # remove unused cats
     # remove unused cats
-    data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
+    data_country = data_country.dropna(f"category ({cat_terminology_in})", how="all")
     # remove unused years
     # remove unused years
-    data_country = data_country.dropna(f'time', how='all')
+    data_country = data_country.dropna(f"time", how="all")
     # remove variables only containing nan
     # remove variables only containing nan
-    nan_vars_country = [var for var in data_country.data_vars if
-                        bool(data_country[var].isnull().all().data) is True]
+    nan_vars_country = [
+        var
+        for var in data_country.data_vars
+        if bool(data_country[var].isnull().all().data) is True
+    ]
     print(f"removing all-nan variables: {nan_vars_country}")
     print(f"removing all-nan variables: {nan_vars_country}")
     data_country = data_country.drop_vars(nan_vars_country)
     data_country = data_country.drop_vars(nan_vars_country)
 
 
     # remove unnecessary variables
     # remove unnecessary variables
-    entities_ignore_present = [entity for entity in entities_to_ignore if
-                               entity in data_country.data_vars]
+    entities_ignore_present = [
+        entity for entity in entities_to_ignore if entity in data_country.data_vars
+    ]
     data_country = data_country.drop_vars(entities_ignore_present)
     data_country = data_country.drop_vars(entities_ignore_present)
 
 
     # filter ()
     # filter ()
@@ -93,167 +101,200 @@ def process_data_for_country(
 
 
     # 2: country specific processing
     # 2: country specific processing
     if processing_info_country is not None:
     if processing_info_country is not None:
-
-        if 'tolerance' in processing_info_country:
+        if "tolerance" in processing_info_country:
             tolerance = processing_info_country["tolerance"]
             tolerance = processing_info_country["tolerance"]
         else:
         else:
             tolerance = 0.01
             tolerance = 0.01
 
 
         # remove entities if needed
         # remove entities if needed
-        if 'ignore_entities' in processing_info_country:
-            entities_to_ignore_country = processing_info_country[
-                'ignore_entities']
-            entities_ignore_present = \
-                [entity for entity in entities_to_ignore_country if
-                 entity in data_country.data_vars]
+        if "ignore_entities" in processing_info_country:
+            entities_to_ignore_country = processing_info_country["ignore_entities"]
+            entities_ignore_present = [
+                entity
+                for entity in entities_to_ignore_country
+                if entity in data_country.data_vars
+            ]
             data_country = data_country.drop_vars(entities_ignore_present)
             data_country = data_country.drop_vars(entities_ignore_present)
 
 
         # take only desired years
         # take only desired years
-        if 'years' in processing_info_country:
+        if "years" in processing_info_country:
             data_country = data_country.pr.loc[
             data_country = data_country.pr.loc[
-                {'time': processing_info_country['years']}]
+                {"time": processing_info_country["years"]}
+            ]
 
 
         # remove timeseries if desired
         # remove timeseries if desired
-        if 'remove_ts' in processing_info_country:
-            for case in processing_info_country['remove_ts']:
-                remove_info = copy.deepcopy(processing_info_country['remove_ts'][case])
+        if "remove_ts" in processing_info_country:
+            for case in processing_info_country["remove_ts"]:
+                remove_info = copy.deepcopy(processing_info_country["remove_ts"][case])
                 entities = remove_info.pop("entities")
                 entities = remove_info.pop("entities")
                 for entity in entities:
                 for entity in entities:
-                    data_country[entity].pr.loc[remove_info] = \
+                    data_country[entity].pr.loc[remove_info] = (
                         data_country[entity].pr.loc[remove_info] * np.nan
                         data_country[entity].pr.loc[remove_info] * np.nan
+                    )
 
 
         # remove all data for given years if necessary
         # remove all data for given years if necessary
-        if 'remove_years' in processing_info_country:
+        if "remove_years" in processing_info_country:
             data_country = data_country.drop_sel(
             data_country = data_country.drop_sel(
-                time=processing_info_country['remove_years'])
+                time=processing_info_country["remove_years"]
+            )
 
 
         # subtract categories
         # subtract categories
-        if 'subtract_cats' in processing_info_country:
-            subtract_cats_current = processing_info_country['subtract_cats']
+        if "subtract_cats" in processing_info_country:
+            subtract_cats_current = processing_info_country["subtract_cats"]
             print(f"Subtracting categories for country {country_code}")
             print(f"Subtracting categories for country {country_code}")
             for cat_to_generate in subtract_cats_current:
             for cat_to_generate in subtract_cats_current:
-                if 'entities' in subtract_cats_current[cat_to_generate].keys():
-                    entities_current = subtract_cats_current[cat_to_generate]['entities']
+                if "entities" in subtract_cats_current[cat_to_generate].keys():
+                    entities_current = subtract_cats_current[cat_to_generate][
+                        "entities"
+                    ]
                 else:
                 else:
                     entities_current = list(data_country.data_vars)
                     entities_current = list(data_country.data_vars)
 
 
-                cats_to_subtract = \
-                    subtract_cats_current[cat_to_generate]['subtract']
-                data_sub = \
-                    data_country[entities_current].pr.loc[
-                        {'category': cats_to_subtract}].pr.sum(
-                        dim='category', skipna=True, min_count=1)
+                cats_to_subtract = subtract_cats_current[cat_to_generate]["subtract"]
+                data_sub = (
+                    data_country[entities_current]
+                    .pr.loc[{"category": cats_to_subtract}]
+                    .pr.sum(dim="category", skipna=True, min_count=1)
+                )
                 data_parent = data_country[entities_current].pr.loc[
                 data_parent = data_country[entities_current].pr.loc[
-                    {'category': subtract_cats_current[cat_to_generate]['parent']}]
+                    {"category": subtract_cats_current[cat_to_generate]["parent"]}
+                ]
                 data_agg = data_parent - data_sub
                 data_agg = data_parent - data_sub
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if data_agg[var].isnull().all().data is True
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
                 if len(data_agg.data_vars) > 0:
                     print(f"Generating {cat_to_generate} through subtraction")
                     print(f"Generating {cat_to_generate} through subtraction")
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
 
 
                     data_agg = data_agg.assign_coords(
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_generate])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_generate],
+                            )
+                        }
+                    )
                     if cat_name_present:
                     if cat_name_present:
-                        cat_name = subtract_cats_current[cat_to_generate]['name']
+                        cat_name = subtract_cats_current[cat_to_generate]["name"]
                         data_agg = data_agg.assign_coords(
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(data_agg, tolerance=tolerance)
                 else:
                 else:
                     print(f"no data to generate category {cat_to_generate}")
                     print(f"no data to generate category {cat_to_generate}")
 
 
         # downscaling
         # downscaling
-        if 'downscale' in processing_info_country:
-            if 'sectors' in processing_info_country['downscale']:
-                sector_downscaling = \
-                    processing_info_country['downscale']['sectors']
+        if "downscale" in processing_info_country:
+            if "sectors" in processing_info_country["downscale"]:
+                sector_downscaling = processing_info_country["downscale"]["sectors"]
                 for case in sector_downscaling.keys():
                 for case in sector_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     print(f"Downscaling for {case}.")
                     sector_downscaling_current = sector_downscaling[case]
                     sector_downscaling_current = sector_downscaling[case]
-                    entities = sector_downscaling_current.pop('entities')
+                    entities = sector_downscaling_current.pop("entities")
                     for entity in entities:
                     for entity in entities:
                         data_country[entity] = data_country[
                         data_country[entity] = data_country[
-                            entity].pr.downscale_timeseries(
-                            **sector_downscaling_current)
+                            entity
+                        ].pr.downscale_timeseries(**sector_downscaling_current)
                         # , skipna_evaluation_dims=None)
                         # , skipna_evaluation_dims=None)
 
 
-            if 'entities' in processing_info_country['downscale']:
-                entity_downscaling = \
-                    processing_info_country['downscale']['entities']
+            if "entities" in processing_info_country["downscale"]:
+                entity_downscaling = processing_info_country["downscale"]["entities"]
                 for case in entity_downscaling.keys():
                 for case in entity_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     print(f"Downscaling for {case}.")
                     # print(data_country.coords[f'category ('
                     # print(data_country.coords[f'category ('
                     #                          f'{cat_terminology_in})'].values)
                     #                          f'{cat_terminology_in})'].values)
                     data_country = data_country.pr.downscale_gas_timeseries(
                     data_country = data_country.pr.downscale_gas_timeseries(
-                        **entity_downscaling[case], skipna=True,
-                        skipna_evaluation_dims=None)
+                        **entity_downscaling[case],
+                        skipna=True,
+                        skipna_evaluation_dims=None,
+                    )
 
 
         # aggregate categories
         # aggregate categories
-        if 'aggregate_cats' in processing_info_country:
-            if 'agg_tolerance' in processing_info_country:
-                agg_tolerance = processing_info_country['agg_tolerance']
+        if "aggregate_cats" in processing_info_country:
+            if "agg_tolerance" in processing_info_country:
+                agg_tolerance = processing_info_country["agg_tolerance"]
             else:
             else:
                 agg_tolerance = tolerance
                 agg_tolerance = tolerance
-            aggregate_cats_current = processing_info_country['aggregate_cats']
+            aggregate_cats_current = processing_info_country["aggregate_cats"]
             print(
             print(
                 f"Aggregating categories for country {country_code}, source {source}, "
                 f"Aggregating categories for country {country_code}, source {source}, "
-                f"scenario {scenario}")
+                f"scenario {scenario}"
+            )
             for cat_to_agg in aggregate_cats_current:
             for cat_to_agg in aggregate_cats_current:
                 print(f"Category: {cat_to_agg}")
                 print(f"Category: {cat_to_agg}")
-                source_cats = aggregate_cats_current[cat_to_agg]['sources']
-                data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
-                    dim='category', skipna=True, min_count=1)
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                source_cats = aggregate_cats_current[cat_to_agg]["sources"]
+                data_agg = data_country.pr.loc[{"category": source_cats}].pr.sum(
+                    dim="category", skipna=True, min_count=1
+                )
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if data_agg[var].isnull().all().data is True
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
                 if len(data_agg.data_vars) > 0:
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
                     data_agg = data_agg.assign_coords(
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_agg])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_agg],
+                            )
+                        }
+                    )
                     if cat_name_present:
                     if cat_name_present:
-                        cat_name = aggregate_cats_current[cat_to_agg]['name']
+                        cat_name = aggregate_cats_current[cat_to_agg]["name"]
                         data_agg = data_agg.assign_coords(
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=agg_tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(
+                        data_agg, tolerance=agg_tolerance
+                    )
                 else:
                 else:
                     print(f"no data to aggregate category {cat_to_agg}")
                     print(f"no data to aggregate category {cat_to_agg}")
 
 
         # copy HFCs and PFCs with default factors
         # copy HFCs and PFCs with default factors
-        if 'basket_copy' in processing_info_country:
+        if "basket_copy" in processing_info_country:
             GWPs_to_add = processing_info_country["basket_copy"]["GWPs_to_add"]
             GWPs_to_add = processing_info_country["basket_copy"]["GWPs_to_add"]
             entities = processing_info_country["basket_copy"]["entities"]
             entities = processing_info_country["basket_copy"]["entities"]
             source_GWP = processing_info_country["basket_copy"]["source_GWP"]
             source_GWP = processing_info_country["basket_copy"]["source_GWP"]
             for entity in entities:
             for entity in entities:
-                data_source = data_country[f'{entity} ({source_GWP})']
+                data_source = data_country[f"{entity} ({source_GWP})"]
                 for GWP in GWPs_to_add:
                 for GWP in GWPs_to_add:
-                    data_GWP = data_source * \
-                               GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    data_GWP = (
+                        data_source * GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    )
                     data_GWP.attrs["entity"] = entity
                     data_GWP.attrs["entity"] = entity
                     data_GWP.attrs["gwp_context"] = GWP
                     data_GWP.attrs["gwp_context"] = GWP
                     data_country[f"{entity} ({GWP})"] = data_GWP
                     data_country[f"{entity} ({GWP})"] = data_GWP
 
 
         # aggregate gases if desired
         # aggregate gases if desired
-        if 'aggregate_gases' in processing_info_country:
+        if "aggregate_gases" in processing_info_country:
             # TODO: why use different code here than below. Can this fill non-existen
             # TODO: why use different code here than below. Can this fill non-existen
             #  gas baskets?
             #  gas baskets?
-            for case in processing_info_country['aggregate_gases'].keys():
-                case_info = processing_info_country['aggregate_gases'][case]
-                data_country[case_info['basket']] = \
-                    data_country.pr.fill_na_gas_basket_from_contents(
-                        **case_info)
+            for case in processing_info_country["aggregate_gases"].keys():
+                case_info = processing_info_country["aggregate_gases"][case]
+                data_country[
+                    case_info["basket"]
+                ] = data_country.pr.fill_na_gas_basket_from_contents(**case_info)
 
 
     # 3: map categories
     # 3: map categories
     if category_conversion is not None:
     if category_conversion is not None:
@@ -270,61 +311,74 @@ def process_data_for_country(
     # more general processing
     # more general processing
     # reduce categories to output cats
     # reduce categories to output cats
     if sectors_out is not None:
     if sectors_out is not None:
-        cats_to_keep = [cat for cat in
-                        data_country.coords[f'category ({cat_terminology_out})'].values
-                        if cat in sectors_out]
-        data_country = data_country.pr.loc[{'category': cats_to_keep}]
+        cats_to_keep = [
+            cat
+            for cat in data_country.coords[f"category ({cat_terminology_out})"].values
+            if cat in sectors_out
+        ]
+        data_country = data_country.pr.loc[{"category": cats_to_keep}]
 
 
     # create gas baskets
     # create gas baskets
     entities_present = set(data_country.data_vars)
     entities_present = set(data_country.data_vars)
     for basket in gas_baskets.keys():
     for basket in gas_baskets.keys():
-        basket_contents_present = [gas for gas in gas_baskets[basket] if
-                                   gas in entities_present]
+        basket_contents_present = [
+            gas for gas in gas_baskets[basket] if gas in entities_present
+        ]
         if len(basket_contents_present) > 0:
             if basket in list(data_country.data_vars):
                 data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
-                    basket=basket, basket_contents=basket_contents_present,
-                    skipna=True, min_count=1)
+                    basket=basket,
+                    basket_contents=basket_contents_present,
+                    skipna=True,
+                    min_count=1,
+                )
             else:
                 try:
-                    #print(data_country.data_vars)
-                    data_country[basket] = xr.full_like(data_country["CO2"],
-                                                        np.nan).pr.quantify(
-                        units="Gg CO2 / year")
-                    data_country[basket].attrs = {"entity": basket.split(' ')[0],
-                                                  "gwp_context": basket.split(' ')[1][
-                                                                 1:-1]}
+                    # print(data_country.data_vars)
+                    data_country[basket] = xr.full_like(
+                        data_country["CO2"], np.nan
+                    ).pr.quantify(units="Gg CO2 / year")
+                    data_country[basket].attrs = {
+                        "entity": basket.split(" ")[0],
+                        "gwp_context": basket.split(" ")[1][1:-1],
+                    }
                     data_country[basket] = data_country.pr.gas_basket_contents_sum(
-                        basket=basket, basket_contents=basket_contents_present,
-                        min_count=1)
+                        basket=basket,
+                        basket_contents=basket_contents_present,
+                        min_count=1,
+                    )
                     entities_present.add(basket)
                 except Exception as ex:
-                    print(f"No gas basket created for {country_code}, {source}, "
-                          f"{scenario}: {ex}")
+                    print(
+                        f"No gas basket created for {country_code}, {source}, "
+                        f"{scenario}: {ex}"
+                    )
 
     # amend title and comment
-    data_country.attrs["comment"] = data_country.attrs["comment"] + f" Processed on " \
-                                                                    f"{date.today()}"
-    data_country.attrs["title"] = data_country.attrs["title"] + f" Processed on " \
-                                                                    f"{date.today()}"
+    data_country.attrs["comment"] = (
+        data_country.attrs["comment"] + f" Processed on " f"{date.today()}"
+    )
+    data_country.attrs["title"] = (
+        data_country.attrs["title"] + f" Processed on " f"{date.today()}"
+    )
 
     return data_country
 
 
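The basket handling above leans on primap2 accessors; stripped of unit and metadata handling, computing a basket from its member gases is a gas-wise sum with min_count semantics. A rough plain-xarray equivalent (variable names and values invented for illustration):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"CO2": ("time", [100.0, 110.0]), "CH4": ("time", [25.0, np.nan])})
    basket_contents = ["CO2", "CH4"]
    # min_count=1: the sum stays NaN only where every member gas is NaN
    ds["KYOTOGHG (AR4GWP100)"] = (
        ds[basket_contents].to_array("entity").sum("entity", skipna=True, min_count=1)
    )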
 def convert_categories(
-        ds_input: xr.Dataset,
-        conversion: Dict[str, Dict[str, str]],
-        #terminology_from: str,
-        terminology_to: str,
-        debug: bool=False,
-        tolerance: float=0.01,
-)->xr.Dataset:
+    ds_input: xr.Dataset,
+    conversion: Dict[str, Dict[str, str]],
+    # terminology_from: str,
+    terminology_to: str,
+    debug: bool = False,
+    tolerance: float = 0.01,
+) -> xr.Dataset:
     """
     """
     convert data from one category terminology to another
     convert data from one category terminology to another
     """
     """
     print(f"converting categories to {terminology_to}")
     print(f"converting categories to {terminology_to}")
 
-    if 'orig_cat_name' in ds_input.coords:
+    if "orig_cat_name" in ds_input.coords:
         cat_name_present = True
     else:
         cat_name_present = False
@@ -338,50 +392,67 @@ def convert_categories(
     ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})
 
     # find categories present in dataset
-    cats_present = list(ds_converted.coords[f'category ({terminology_to})'])
+    cats_present = list(ds_converted.coords[f"category ({terminology_to})"])
 
     # restrict categories and map category names
-    if 'mapping' in conversion.keys():
-        mapping_cats_present = [cat for cat in list(conversion['mapping'].keys()) if
-                                cat in cats_present]
-        ds_converted = ds_converted.pr.loc[
-            {'category': mapping_cats_present}]
-
-        from_cats = ds_converted.coords[f'category ({terminology_to})'].values
-        to_cats = pd.Series(from_cats).replace(conversion['mapping'])
-        ds_converted = ds_converted.assign_coords({f'category ({terminology_to})':
-                                                   (f'category ({terminology_to})',
-                                                    to_cats)})
+    if "mapping" in conversion.keys():
+        mapping_cats_present = [
+            cat for cat in list(conversion["mapping"].keys()) if cat in cats_present
+        ]
+        ds_converted = ds_converted.pr.loc[{"category": mapping_cats_present}]
+
+        from_cats = ds_converted.coords[f"category ({terminology_to})"].values
+        to_cats = pd.Series(from_cats).replace(conversion["mapping"])
+        ds_converted = ds_converted.assign_coords(
+            {f"category ({terminology_to})": (f"category ({terminology_to})", to_cats)}
+        )
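The renaming above goes through pandas to vectorise the dict lookup; the same pattern in isolation, with invented category codes:

    import pandas as pd

    mapping = {"1A1": "1.A.1", "1A2": "1.A.2"}
    to_cats = pd.Series(["1A1", "1A2"]).replace(mapping)
    print(list(to_cats))  # ['1.A.1', '1.A.2']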
 
     # redo the list of present cats after mapping, as we have new categories in the
     # target terminology now
-    cats_present_mapped = list(ds_converted.coords[f'category ('
-                                                   f'{terminology_to})'].values)
+    cats_present_mapped = list(
+        ds_converted.coords[f"category (" f"{terminology_to})"].values
+    )
     # aggregate categories
-    if 'aggregate' in conversion:
-        aggregate_cats = conversion['aggregate']
+    if "aggregate" in conversion:
+        aggregate_cats = conversion["aggregate"]
         for cat_to_agg in aggregate_cats:
             if debug:
                 print(f"Category: {cat_to_agg}")
-            source_cats = [cat for cat in aggregate_cats[cat_to_agg]['sources'] if
-                           cat in cats_present_mapped]
+            source_cats = [
+                cat
+                for cat in aggregate_cats[cat_to_agg]["sources"]
+                if cat in cats_present_mapped
+            ]
             if debug:
                 print(source_cats)
-            data_agg = ds_converted.pr.loc[{'category': source_cats}].pr.sum(
-                dim='category', skipna=True, min_count=1)
-            nan_vars = [var for var in data_agg.data_vars if
-                        data_agg[var].isnull().all().data == True]
+            data_agg = ds_converted.pr.loc[{"category": source_cats}].pr.sum(
+                dim="category", skipna=True, min_count=1
+            )
+            nan_vars = [
+                var
+                for var in data_agg.data_vars
+                if data_agg[var].isnull().all().data == True
+            ]
             data_agg = data_agg.drop(nan_vars)
             if len(data_agg.data_vars) > 0:
-                data_agg = data_agg.expand_dims([f'category ({terminology_to})'])
+                data_agg = data_agg.expand_dims([f"category ({terminology_to})"])
                 data_agg = data_agg.assign_coords(
-                    coords={f'category ({terminology_to})':
-                                (f'category ({terminology_to})', [cat_to_agg])})
+                    coords={
+                        f"category ({terminology_to})": (
+                            f"category ({terminology_to})",
+                            [cat_to_agg],
+                        )
+                    }
+                )
                 if cat_name_present:
                     data_agg = data_agg.assign_coords(
-                        coords={'orig_cat_name':
-                                    (f'category ({terminology_to})',
-                                     [aggregate_cats[cat_to_agg]['name']])})
+                        coords={
+                            "orig_cat_name": (
+                                f"category ({terminology_to})",
+                                [aggregate_cats[cat_to_agg]["name"]],
+                            )
+                        }
+                    )
                 ds_converted = ds_converted.pr.merge(data_agg, tolerance=tolerance)
                 cats_present_mapped.append(cat_to_agg)
             else:
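At its core the aggregation above is a labelled sum over the category dimension; primap2's pr.loc and pr.sum add unit and metadata handling on top. A sketch with plain xarray (category codes and values invented):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(
        [[1.0, 2.0], [3.0, np.nan]],
        dims=["category", "time"],
        coords={"category": ["3A1", "3A2"]},
    )
    # skipna with min_count=1 keeps the result NaN only if all sources are NaN
    agg = da.sum("category", skipna=True, min_count=1)
    print(agg.values)  # [4. 2.]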
@@ -391,9 +462,9 @@ def convert_categories(
 
 
 def get_country_name(
-        country_code: str,
+    country_code: str,
 ) -> str:
-    """get country name from code """
+    """get country name from code"""
     if country_code in custom_country_mapping:
         country_name = custom_country_mapping[country_code]
     else:
@@ -401,15 +472,16 @@ def get_country_name(
             country = pycountry.countries.get(alpha_3=country_code)
             country_name = country.name
         except:
-            raise ValueError(f"Country code {country_code} can not be mapped to "
-                             f"any country")
+            raise ValueError(
+                f"Country code {country_code} can not be mapped to " f"any country"
+            )
 
     return country_name
 
 
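pycountry.countries.get does an exact lookup on ISO 3166 fields, e.g.:

    import pycountry

    country = pycountry.countries.get(alpha_3="PER")
    print(country.name)  # Peru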
 def get_country_code(
-        country_name: str,
-)->str:
+    country_name: str,
+) -> str:
     """
     """
     obtain country code. If the input is a code it will be returned,
     obtain country code. If the input is a code it will be returned,
     if the input
     if the input
@@ -435,28 +507,31 @@ def get_country_code(
             country_code = country.alpha_3
         except:
             try:
-                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
+                country = pycountry.countries.search_fuzzy(
+                    country_name.replace("_", " ")
+                )
             except:
-                raise ValueError(f"Country name {country_name} can not be mapped to "
-                                 f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
+                raise ValueError(
+                    f"Country name {country_name} can not be mapped to "
+                    f"any country code. Try using the ISO3 code directly."
+                )
             if len(country) > 1:
                 country_code = None
                 for current_country in country:
                     if current_country.name == country_name:
                         country_code = current_country.alpha_3
                 if country_code is None:
-                    raise ValueError(f"Country name {country_name} has {len(country)} "
-                                     f"possible results for country codes.")
+                    raise ValueError(
+                        f"Country name {country_name} has {len(country)} "
+                        f"possible results for country codes."
+                    )
 
             country_code = country[0].alpha_3
 
     return country_code
 
 
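Unlike get, search_fuzzy returns a list of candidate countries ordered by match quality, which is why the code above has to disambiguate when more than one result comes back:

    import pycountry

    candidates = pycountry.countries.search_fuzzy("Peru")
    print([c.alpha_3 for c in candidates])  # first candidate: 'PER'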
-def create_folder_mapping(
-        folder: str,
-        extracted: bool = False
-) -> None:
+def create_folder_mapping(folder: str, extracted: bool = False) -> None:
     """
     """
     Create a mapping from 3 letter ISO country codes to folders
     Create a mapping from 3 letter ISO country codes to folders
     based on the subfolders of the given folder. The mapping is
     based on the subfolders of the given folder. The mapping is
@@ -480,9 +555,9 @@ def create_folder_mapping(
 
     folder = root_path / folder
     folder_mapping = {}
-    #if not extracted:
+    # if not extracted:
     known_folders = custom_folders
-    #else:
+    # else:
     #    known_folders = {}
 
     for item in folder.iterdir():
@@ -491,7 +566,9 @@ def create_folder_mapping(
                 ISO3 = known_folders[item.name]
             else:
                 try:
-                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
+                    country = pycountry.countries.search_fuzzy(
+                        item.name.replace("_", " ")
+                    )
                     if len(country) > 1:
                         ISO3 = None
                         for current_country in country:
@@ -516,8 +593,8 @@ def create_folder_mapping(
 
 # TODO add crf
 def get_country_submissions(
-        country_name: str,
-        print_sub: bool = True,
+    country_name: str,
+    print_sub: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO code for a country, or the country's name.
@@ -585,8 +662,8 @@ def get_country_submissions(
 
 
 def get_country_datasets(
-        country_name: str,
-        print_ds: bool = True,
+    country_name: str,
+    print_ds: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO code for a country, or the country's name.
@@ -638,35 +715,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
 
                 datasets_current_folder = {}
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] =\
-                            dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong code: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology})"
                         key = f"{parts[1]} ({parts[2]}, {terminology})"
                         data_info = ""
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
                             data_info = data_info + "incomplete IF (.yaml), "
 
 
                         code_file = get_code_file(country_code, parts[1])
                         code_file = get_code_file(country_code, parts[1])
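The file stems being parsed here follow the pattern <ISO3>_<submission>_<date or version>_<terminology parts>; a standalone illustration of the split (the stem itself is hypothetical):

    stem = "PER_BUR3_2023_IPCC2006_PRIMAP"  # hypothetical file stem
    parts = stem.split("_")
    terminology = "_".join(parts[3:])
    key = f"{parts[1]} ({parts[2]}, {terminology})"
    print(key)  # BUR3 (2023, IPCC2006_PRIMAP)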
@@ -680,7 +764,9 @@ def get_country_datasets(
                 if print_ds:
                     if cleaned_datasets_current_folder:
                         for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                            print(
+                                f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                            )
                     else:
                         print("No data available")
                     print("")
@@ -708,34 +794,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
 
                 datasets_current_folder = {}
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong code: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                         key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                         data_info = ""
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
                             data_info = data_info + "incomplete IF (.yaml), "
 
 
                         cleaned_datasets_current_folder[key] = data_info
                         cleaned_datasets_current_folder[key] = data_info
@@ -743,7 +837,9 @@ def get_country_datasets(
                 if print_ds:
                     if cleaned_datasets_current_folder:
                         for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                            print(
+                                f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                            )
                     else:
                         print("No data available")
                     print("")
@@ -759,9 +855,9 @@ def get_country_datasets(
 
 
 def get_code_file(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
+    country_name: str,
+    submission: str,
+    print_info: bool = False,
 ) -> Path:
     """
     For given country name and submission find the script that creates the data
@@ -813,13 +909,17 @@ def get_code_file(
         for file in country_folder.iterdir():
             if file.match(code_file_name_candidate):
                 if code_file_path is not None:
-                    raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
-                                     f"{code_file_path} and file.name. "
-                                     f"Please use only one file with name "
-                                     f"'read_ISO3_submission_XXX.YYY'.")
+                    raise ValueError(
+                        f"Found multiple code file candidates: "
+                        f"{code_file_path} and {file.name}. "
+                        f"Please use only one file with name "
+                        f"'read_ISO3_submission_XXX.YYY'."
+                    )
                 else:
                     if print_info:
-                        print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
+                        print(
+                            f"Found code file {file.relative_to(root_path)}"
+                        )
                 code_file_path = file
 
     if code_file_path is not None:
@@ -828,8 +928,10 @@ def get_code_file(
         return None
 
 
-def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int)->pd.DataFrame:
-    '''
+def fix_rows(
+    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
+) -> pd.DataFrame:
+    """
     Function to fix rows that have been split during reading from PDF.
     This is the version used for Malaysia BUR3 and BUR4; adapt for other BURs if needed.
 
@@ -838,18 +940,20 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
     :param col_to_use:
     :param n_rows:
     :return:
-    '''
+    """
     for row in rows_to_fix:
-        #print(row)
+        # print(row)
         # find the row number and collect the row and the following rows
         index = data.loc[data[col_to_use] == row].index
-        #print(list(index))
+        # print(list(index))
         if not list(index):
             print(f"Can't merge split row {row}")
             print(data[col_to_use])
-        #print(f"Merging split row {row} for table {page}")
+        # print(f"Merging split row {row} for table {page}")
         loc = data.index.get_loc(index[0])
-        if n_rows == -3:
+        if n_rows == -2:
+            locs_to_merge = list(range(loc - 1, loc + 1))
+        elif n_rows == -3:
             locs_to_merge = list(range(loc - 1, loc + 2))
         elif n_rows == -5:
             locs_to_merge = list(range(loc - 1, loc + 4))
@@ -858,7 +962,7 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         rows_to_merge = data.iloc[locs_to_merge]
         indices_to_merge = rows_to_merge.index
         # join the rows
-        new_row = rows_to_merge.agg(' '.join)
+        new_row = rows_to_merge.agg(" ".join)
         # replace the double spaces that are created
         # must be done here and not at the end as splits are not always
         # the same and join would produce different col values
@@ -866,6 +970,10 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         new_row = new_row.str.replace("N O", "NO")
         new_row = new_row.str.replace("N O", "NO")
         new_row = new_row.str.replace(", N", ",N")
         new_row = new_row.str.replace(", N", ",N")
         new_row = new_row.str.replace("- ", "-")
         new_row = new_row.str.replace("- ", "-")
+        # replace spaces in numbers
+        pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
+        repl = lambda m: f"{m.group('first')}{m.group('last')}"
+        new_row = new_row.str.replace(pat, repl, regex=True)
         data.loc[indices_to_merge[0]] = new_row
         data.loc[indices_to_merge[0]] = new_row
         data = data.drop(indices_to_merge[1:])
         data = data.drop(indices_to_merge[1:])
-    return data
+    return data
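The added pattern repairs numbers that picked up a stray space during PDF extraction: the two named groups match digit, comma, and period runs, and the callable replacement concatenates them. A standalone demonstration with invented cell values:

    import pandas as pd

    pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
    repl = lambda m: f"{m.group('first')}{m.group('last')}"

    cells = pd.Series(["1 234.5", "12.3", "NO x"])
    print(list(cells.str.replace(pat, repl, regex=True)))
    # ['1234.5', '12.3', 'NO x'] -- only purely numeric pairs are merged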

+ 3 - 0
pyproject.toml

@@ -6,3 +6,6 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
 
+[tool.black]
+line-length = 88
+
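88 characters is black's own default line length, so the [tool.black] section mainly pins the project's formatting choice explicitly; running black over the package then reproduces the reformatting seen in the hunks above.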

+ 2 - 1
setup.cfg

@@ -30,7 +30,7 @@ packages =
     UNFCCC_GHG_data.UNFCCC_downloader
     UNFCCC_GHG_data.UNFCCC_DI_reader
     UNFCCC_GHG_data.helper
-    #UNFCCC_GHG_data.datasets
+#UNFCCC_GHG_data.datasets
 python_requires = >=3.8
 setup_requires =
     setuptools_scm
@@ -70,6 +70,7 @@ dev =
     jupyter
     dask
     ipympl
+    black
 
 
 [options.package_data]