@@ -4,8 +4,11 @@
import camelot
import primap2 as pm2
import pandas as pd
-import numpy as np
+import copy
from pathlib import Path
+from config_MAR_BUR3 import zero_cats, cat_mapping, aggregate_cats, remove_cats, \
+ table_defs, header_defs
+from primap2.pm2io._data_reading import matches_time_format, filter_data
# ###
# configuration
@@ -15,12 +18,8 @@ root_path = root_path.resolve()
downloaded_data_path = root_path / "downloaded_data"
extracted_data_path = root_path / "extracted_data"
input_folder = downloaded_data_path / 'UNFCCC' / 'Morocco' / 'BUR3'
output_folder = extracted_data_path / 'UNFCCC' / 'Morocco'
-if not output_folder.exists():
- output_folder.mkdir()
output_filename = 'MAR_BUR3_2022_'
inventory_file = 'Morocco_BUR3_Fr.pdf'
@@ -33,79 +32,29 @@ pages_to_read = range(104, 138)
compression = dict(zlib=True, complevel=9)
-header_defs = {
- 'Energy': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
- ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg']],
- 'Agriculture': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
- ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg']],
- 'IPPU': [['Catégories', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6', 'NOx', 'CO', 'COVNM', 'SO2'],
- ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
- 'LULUCF': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
- ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
- 'Waste': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
- ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
-# define which raw tables to combine
-table_defs = {
- 2010: {
- 'Energy': [0, 1],
- 'Agriculture': [10],
- 'IPPU': [15, 16, 17],
- 'LULUCF': [30],
- 'Waste': [35],
- },
- 2012: {
- 'Energy': [2, 3],
- 'Agriculture': [11],
- 'IPPU': [18, 19, 20],
- 'LULUCF': [31],
- 'Waste': [36],
- },
- 2014: {
- 'Energy': [4, 5],
- 'Agriculture': [10],
- 'IPPU': [21, 22, 23],
- 'LULUCF': [32],
- 'Waste': [37],
- },
- 2016: {
- 'Energy': [6, 7],
- 'Agriculture': [10],
- 'IPPU': [24, 25, 26],
- 'LULUCF': [33],
- 'Waste': [38],
- },
- 2018: {
- 'Energy': [8, 9],
- 'Agriculture': [14],
- 'IPPU': [27, 28, 29],
- 'LULUCF': [34],
- 'Waste': [39],
- },
# special header as category code and name in one column
header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
index_cols = ['Catégories']
# rows to remove
-cats_remove = []
+cats_remove = [
+ 'Agriculture' # always empty
# manual category codes
cat_codes_manual = {
'1.A.2.e -Industries agro-alimentaires et du tabac': '1.A.2.e',
'1.A.2.f -Industries des minéraux non- métalliques': '1.A.2.f',
- 'Agriculture': 'M.AG',
+ #'Agriculture': 'M.AG',
'2. PIUP': '2',
- '3.B.1 Terres forestières': '3.B.1',
- '3.B.2 Terres cultivées': '3.B.2',
- '3.B.3 Prairies': '3.B.3',
- '3.B.4 Terres humides': '3.B.4',
- '3.B.5 Etablissements': '3.B.5',
- '3.B.6 Autres terres': '3.B.6',
+ '3.B.1 Terres forestières': 'LU.3.B.1',
+ '3.B.2 Terres cultivées': 'LU.3.B.2',
+ '3.B.3 Prairies': 'LU.3.B.3',
+ '3.B.4 Terres humides': 'LU.3.B.4',
+ '3.B.5 Etablissements': 'LU.3.B.5',
+ '3.B.6 Autres terres': 'LU.3.B.6',
'1.B.1.a.i.1 -Exploitation minière': '1.A.1.a.i.1',
@@ -113,7 +62,7 @@ cat_code_regexp = r'(?P<code>^[a-zA-Z0-9\.]{1,14})\s-\s.*'
coords_terminologies = {
"area": "ISO3",
- "category": "IPCC2006_PRIMAP",
+ "category": "IPCC1996_2006_MAR_Inv",
"scenario": "PRIMAP",
@@ -140,9 +89,9 @@ coords_cols = {
"unit": "unit"
-add_coords_cols = {
- "orig_cat_name": ["orig_cat_name", "category"],
+#add_coords_cols = {
+# "orig_cat_name": ["orig_cat_name", "category"],
filter_remove = {
"f1": {
@@ -184,6 +133,13 @@ for year in table_defs.keys():
df_this_table = df_this_table.drop(df_this_table.iloc[0:2].index)
df_this_table.columns = header_defs[sector]
+ # fix 2018 agri table: move the values of all columns except the first one
+ # down by one row for the first `last_shift_row` rows and blank the data
+ # cells of the first row (presumably the values were read one row too high
+ # from the PDF - confirm against the source table)
+ if year == 2018 and sector == "Agriculture":
+     last_shift_row = 25
+     # copy first, as source and target row ranges overlap
+     df_temp = df_this_table.iloc[0: last_shift_row, 1:].copy()
+     df_this_table.iloc[0, 1:] = ''
+     df_this_table.iloc[1: last_shift_row + 1, 1:] = df_temp
# replace line breaks, long hyphens, double, and triple spaces in category names
df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(" ", " ")
@@ -221,7 +177,7 @@ df_all = df_all.reset_index(drop=True)
# prepare numbers for pd.to_numeric
df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(' ', '')
-repl = lambda m: m.group('part1') + m.group('part2')
+def repl(m):
+    """Return the match with the decimal comma replaced by a decimal point.
+
+    `m` is a regex match object providing the named groups 'part1' (digits
+    before the comma) and 'part2' (characters after the comma).
+    """
+    return m.group('part1') + '.' + m.group('part2')
df_all.loc[:, 'data'] = df_all.loc[:, 'data'].str.replace(
'(?P<part1>[0-9]+),(?P<part2>[0-9\.]+)$', repl, regex=True)
df_all['data'][df_all['data'].isnull()] = 'NaN'
@@ -231,6 +187,9 @@ for entity in df_all["entity"].unique():
df_all["entity"][(df_all["entity"] == entity) & (
df_all["unit"] == "GgCO2eq")] = f"{entity} ({gwp_to_use})"
+# drop "orig_cat_name" as it has non-unique values per category
+df_all = df_all.drop(columns="orig_cat_name")
data_if = pm2.pm2io.convert_long_dataframe_if(
@@ -265,9 +224,94 @@ data_pm2 = data_pm2.drop_vars(entities_to_convert)
# convert back to IF to have units in the fixed format
data_if = data_pm2.pr.to_interchange_format()
-##### save data to IF and native format ####
+# ###
+# convert to IPCC2006 categories
+# ###
+data_if_2006 = copy.deepcopy(data_if)
+data_if_2006.attrs = copy.deepcopy(data_if.attrs)
+filter_remove_cats = {
+ "cat": {
+ f"category ({coords_terminologies['category']})":
+ remove_cats
+ },
+filter_data(data_if_2006, filter_remove=filter_remove_cats)
+# map categories: replace the values of the category column according to
+# cat_mapping
+data_if_2006 = data_if_2006.replace(
+    {f"category ({coords_terminologies['category']})": cat_mapping})
+# rename the category col
+ f"category ({coords_terminologies['category']})": 'category (IPCC2006_PRIMAP)'},
+ inplace=True)
+data_if_2006.attrs['attrs']['cat'] = 'category (IPCC2006_PRIMAP)'
+data_if_2006.attrs['dimensions']['*'] = [
+ 'category (IPCC2006_PRIMAP)' if item == f"category ({coords_terminologies['category']})"
+ else item for item in data_if_2006.attrs['dimensions']['*']]
+# aggregate categories
+time_format = '%Y'
+time_columns = [
+ col
+ for col in data_if_2006.columns.values
+ if matches_time_format(col, time_format)
+# for every target category in aggregate_cats, sum the rows of its source
+# categories and append the result as new rows
+for cat_to_agg in aggregate_cats:
+    # select all rows that belong to one of the source categories
+    mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
+        aggregate_cats[cat_to_agg]["sources"])
+    df_test = data_if_2006[mask]
+    if df_test.empty:
+        print(f"no data to aggregate category {cat_to_agg}")
+        continue
+
+    print(f"Aggregating category {cat_to_agg}")
+    df_combine = df_test.copy(deep=True)
+    # time columns may hold strings; non-convertible values become NaN
+    for col in time_columns:
+        df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
+
+    # numeric_only=True restricts the sum to the (numeric) time columns, so the
+    # string-valued category column is dropped explicitly instead of relying
+    # on pandas removing it implicitly; otherwise the insert below would
+    # collide with the existing column.
+    # min_count=1 yields NaN (not 0) for groups without any valid value.
+    df_combine = df_combine.groupby(
+        by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
+            'unit']).sum(numeric_only=True, min_count=1)
+
+    df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
+    df_combine = df_combine.reset_index()
+
+    data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join='outer')
+    data_if_2006 = data_if_2006.reset_index(drop=True)
+# add rows with value 0 for every category in zero_cats; the rows of
+# category "1" serve as template for all non-time columns
+for cat in zero_cats:
+    data_zero = data_if_2006[
+        data_if_2006["category (IPCC2006_PRIMAP)"] == "1"].copy(deep=True)
+    data_zero["category (IPCC2006_PRIMAP)"] = cat
+    for col in time_columns:
+        data_zero[col] = 0
+    data_if_2006 = pd.concat([data_if_2006, data_zero])
+# conversion to PRIMAP2 native format
+data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
+# convert back to IF to have units in the fixed format
+data_if_2006 = data_pm2_2006.pr.to_interchange_format()
+# ###
+# save data to IF and native format
+# ###
if not output_folder.exists():
+# data in original categories
output_folder / (output_filename + coords_terminologies["category"]), data_if)
@@ -276,3 +320,10 @@ data_pm2.pr.to_netcdf(
output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+# data in 2006 categories
+ output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006)
+encoding = {var: compression for var in data_pm2_2006.data_vars}
+ output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding)