Browse Source

guiniea notebook

Daniel Busch 1 year ago
parent
commit
3c9380e769

+ 2570 - 0
UNFCCC_GHG_data/UNFCCC_reader/Guinea/.ipynb_checkpoints/Guinea_BUR1_test_v3-checkpoint.ipynb

@@ -0,0 +1,2570 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a8f3f028-ef62-4014-b911-7a61d24e3dae",
+   "metadata": {},
+   "source": [
+    "### ToDos\n",
+    "- check if unit row length is correct"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "461e34a0-47b1-44a7-ba1a-77db66ea783a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set environment variable (only for jupyter notebook)\n",
+    "import os\n",
+    "# setdefault keeps a root path that is already configured in the environment,\n",
+    "# the local path below is only used as a fallback\n",
+    "os.environ.setdefault(\"UNFCCC_GHG_ROOT_PATH\", \"/Users/danielbusch/Documents/UNFCCC_non-AnnexI_data\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "83dd87db-4956-4bb1-937a-84629bfce95b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import camelot\n",
+    "import primap2 as pm2\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from pathlib import Path\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c37d6d49-076c-4823-a486-83fbda3fa33f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ###\n",
+    "# configuration\n",
+    "# ###\n",
+    "\n",
+    "input_folder = downloaded_data_path / 'UNFCCC' / 'Guinea' / 'BUR1'\n",
+    "output_folder = extracted_data_path / 'UNFCCC' / 'Guinea'\n",
+    "# parents=True also creates missing intermediate folders,\n",
+    "# exist_ok=True makes the cell safe to re-run\n",
+    "output_folder.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "pdf_file = \"Rapport_IGES-Guinee-BUR1_VF.pdf\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "87bf46ce-441e-4247-b62a-ce5ebcf26cb8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# primap2 format conversion: these three dicts are passed as coords_cols,\n",
+    "# coords_defaults and coords_terminologies to the interchange format conversion\n",
+    "coords_cols = dict(\n",
+    "    category=\"category\",\n",
+    "    entity=\"entity\",\n",
+    "    unit=\"unit\",\n",
+    ")\n",
+    "\n",
+    "coords_defaults = dict(\n",
+    "    source=\"GIN-GHG-Inventory\",\n",
+    "    provenance=\"measured\",\n",
+    "    area=\"GIN\",\n",
+    "    scenario=\"BUR1\",\n",
+    ")\n",
+    "\n",
+    "coords_terminologies = dict(\n",
+    "    area=\"ISO3\",\n",
+    "    # TODO check if this is correct\n",
+    "    category=\"IPCC1996_2006_GIN_Inv\",\n",
+    "    scenario=\"PRIMAP\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "23676d59-d7e9-455c-b713-7ce98b92d5d7",
+   "metadata": {},
+   "source": [
+    "### Q: How to choose gwp_to_use?\n",
+    "### Q: 'unit' and 'category' are 'PRIMAP1'. Are there other options?\n",
+    "### Q: Why are we mapping 'NMVOCs': 'NMVOC', wouldn't it be easier to name it NMVOC in the first place?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "953ddab6-07ee-4b60-82f0-f2e9ca76b1a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Are we choosing this gwp\n",
+    "gwp_to_use = \"AR4GWP100\"\n",
+    "\n",
+    "# one mapping per table type; unit and category always use \"PRIMAP1\"\n",
+    "coords_value_mapping = {\n",
+    "    'main': {\n",
+    "        \"unit\": \"PRIMAP1\",\n",
+    "        \"category\": \"PRIMAP1\",\n",
+    "        \"entity\": {\n",
+    "            'HFCs': f\"HFCS ({gwp_to_use})\",\n",
+    "            'PFCs': f\"PFCS ({gwp_to_use})\",\n",
+    "            'SF6' : f\"SF6 ({gwp_to_use})\",\n",
+    "            'NMVOCs': 'NMVOC',\n",
+    "        },\n",
+    "    },\n",
+    "    # the sector tables only rename NMVOCs; the comprehension builds a\n",
+    "    # separate dict for every sector\n",
+    "    **{\n",
+    "        sector: {\n",
+    "            \"unit\": \"PRIMAP1\",\n",
+    "            \"category\": \"PRIMAP1\",\n",
+    "            \"entity\": {'NMVOCs': 'NMVOC'},\n",
+    "        }\n",
+    "        for sector in ('energy', 'lulucf', 'waste')\n",
+    "    },\n",
+    "    # no entity mapping for the trend table\n",
+    "    'trend': {\n",
+    "        \"unit\": \"PRIMAP1\",\n",
+    "        \"category\": \"PRIMAP1\",\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# passed as filter_remove to the interchange format conversion\n",
+    "filter_remove = {'f_memo': {\"category\": \"MEMO\"}}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ef888811-5803-4df7-8fd8-06830e6d9bce",
+   "metadata": {},
+   "source": [
+    "### Q: What to put under references and rights?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "23b39c1a-700c-46f9-a3f5-33549658ad69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# meta data passed to the interchange format conversion\n",
+    "meta_data = {\n",
+    "    # TODO: replace the placeholder with the actual reference\n",
+    "    \"references\": \"placeholder\",\n",
+    "    \"rights\": \"\",\n",
+    "    \"contact\": \"mail@johannes-guetschow.de\",\n",
+    "    \"title\": \"Guinea. Biennial update report (BUR). BUR1\",\n",
+    "    \"comment\": \"Read from pdf by Daniel Busch\",\n",
+    "    \"institution\": \"UNFCCC\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "2390fb91-d976-47f9-9236-a6c838e1fd56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# per pdf page: the values passed to camelot as table_areas (\"area\")\n",
+    "# and columns (\"cols\")\n",
+    "page_def_templates = {\n",
+    "    '110': dict(\n",
+    "        area=['36,718,589,87'],\n",
+    "        cols=['290,340,368,392,425,445,465,497,535,564'],\n",
+    "    ),\n",
+    "    '111': dict(\n",
+    "        area=['36,736,587,107'],\n",
+    "        cols=['293,335,369,399,424,445,468,497,535,565'],\n",
+    "    ),\n",
+    "    '112': dict(\n",
+    "        area=['35,733,588,106'],\n",
+    "        cols=['293,335,369,399,424,445,468,497,535,565'],\n",
+    "    ),\n",
+    "    '113': dict(\n",
+    "        area=['35,733,588,106'],\n",
+    "        cols=['293,335,365,399,424,445,468,497,535,565'],\n",
+    "    ),\n",
+    "    '131': dict(\n",
+    "        area=['36,718,590,83'],\n",
+    "        cols=['293,332,370,406,442,480,516,554'],\n",
+    "    ),\n",
+    "}\n",
+    "\n",
+    "# for main table\n",
+    "header_inventory = ['Greenhouse gas source and sink categories',\n",
+    "                   'CO2', 'CH4', \"N2O\", 'HFCs', 'PFCs', 'SF6', 'NOx', 'CO', 'NMVOCs','SO2'\n",
+    "                   ]\n",
+    "# one unit per header entry: '-' for the category column and 'Gg' for every\n",
+    "# other column, so that header row and unit row have the same length\n",
+    "unit_inventory = ['-'] + ['Gg'] * (len(header_inventory) - 1)\n",
+    "# positions 4-6 of the header are HFCs, PFCs and SF6\n",
+    "unit_inventory[4] = \"GgCO2eq\"\n",
+    "unit_inventory[5] = \"GgCO2eq\"\n",
+    "unit_inventory[6] = \"GgCO2eq\"\n",
+    "\n",
+    "# for energy tables\n",
+    "header_energy = ['Greenhouse gas source and sink categories',\n",
+    "                   'CO2', 'CH4', \"N2O\", 'NOx', 'CO', 'NMVOCs','SO2'\n",
+    "                   ]\n",
+    "# same length as the header, consistent with the lulucf and waste unit rows\n",
+    "unit_energy = ['-'] + ['Gg'] * (len(header_energy) - 1)\n",
+    "\n",
+    "# for lulucf tables\n",
+    "header_lulucf = ['Greenhouse gas source and sink categories', 'CO2', 'CH4', \"N2O\", 'NOx', 'CO', 'NMVOCs']\n",
+    "unit_lulucf = ['-'] + ['Gg'] * (len(header_lulucf) - 1)\n",
+    "\n",
+    "# for waste table\n",
+    "header_waste = ['Greenhouse gas source and sink categories', 'CO2', 'CH4', \"N2O\", 'NOx', 'CO', 'NMVOCs', 'SO2']\n",
+    "unit_waste = ['-'] + ['Gg'] * (len(header_waste) - 1)\n",
+    "\n",
+    "# for trend table (unit is always Gg for this table)\n",
+    "header_trend = ['data1990', 'data1995', \"data2000\", 'data2005', 'data2010', 'data2015', 'data2018', 'data2019']\n",
+    "\n",
+    "\n",
+    "# define config dict\n",
+    "inv_conf = {\n",
+    "    # header and unit rows per table type\n",
+    "    'header': header_inventory,\n",
+    "    'unit': unit_inventory,\n",
+    "    'header_energy' : header_energy,\n",
+    "    'unit_energy' : unit_energy,\n",
+    "    'header_lulucf' : header_lulucf,\n",
+    "    'unit_lulucf' : unit_lulucf,\n",
+    "    'header_waste' : header_waste,\n",
+    "    'unit_waste' : unit_waste,\n",
+    "    'header_trend' : header_trend,\n",
+    "    # rows of the prepended header frame that hold entities and units\n",
+    "    'entity_row': 0,\n",
+    "    'unit_row': 1,\n",
+    "    'index_cols': \"Greenhouse gas source and sink categories\",\n",
+    "    # year of the table on each pdf page; every group of four pages\n",
+    "    # covers 1990, 2000, 2010 and 2019\n",
+    "    'year': dict(zip(\n",
+    "        ['110', '111', '112', '113',\n",
+    "         '116', '117', '118', '119',\n",
+    "         '124', '125', '126', '127'],\n",
+    "        [1990, 2000, 2010, 2019] * 3,\n",
+    "    )),\n",
+    "    'header_long': [\"orig_cat_name\", \"entity\", \"unit\", \"time\", \"data\"],\n",
+    "    \"cat_code_regexp\" : r'^(?P<code>[a-zA-Z0-9\\.]{1,11})[\\s\\.].*'\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd0b97f8-acbb-4df1-9764-b2d0f6af39ba",
+   "metadata": {},
+   "source": [
+    "## 1. Read main tables - pages 110, 111, 112, 113"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "4357ddd0-e9ee-4b2b-a765-c36411df63e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Reading table from page 110.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 111.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 112.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 113.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "Converting to interchange format.\n"
+     ]
+    }
+   ],
+   "source": [
+    "pages = ['110', '111', '112', '113']\n",
+    "\n",
+    "# manual replacements of category names by codes (applied before the regex)\n",
+    "# TODO: move this to config section\n",
+    "# setdefault creates the 'cat_codes_manual' entry if the config dict does not\n",
+    "# have it yet; assigning to inv_conf[\"cat_codes_manual\"]['main'] directly\n",
+    "# raises a KeyError in that case\n",
+    "inv_conf.setdefault(\"cat_codes_manual\", {})['main'] = {\n",
+    "    'Éléments pour mémoire': 'MEMO',\n",
+    "    'Soutes internationales': 'M.BK',\n",
+    "    '1.A.3.a.i - Aviation internationale (soutes internationales)': 'M.BK.A',\n",
+    "    '1.A.3.d.i - Navigation internationale (soutes internationales)' : 'M.BK.M',\n",
+    "    '1.A.5.c - Opérations multilatérales' : 'M.MULTIOP',\n",
+    "    'Total des émissions et absorptions nationales': \"0\",\n",
+    "    '2A5: Autre': '2A5',\n",
+    "}\n",
+    "\n",
+    "# callback for the regex replacement: keep only the captured category code\n",
+    "repl = lambda m: m.group('code')\n",
+    "\n",
+    "df_all_dict = {}\n",
+    "for page in pages:\n",
+    "\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Reading table from page {page}.\")\n",
+    "\n",
+    "    tables_inventory_original = camelot.read_pdf(\n",
+    "        str(input_folder / pdf_file),\n",
+    "        pages=page,\n",
+    "        table_areas=page_def_templates[page][\"area\"],\n",
+    "        columns=page_def_templates[page][\"cols\"],\n",
+    "        flavor=\"stream\",\n",
+    "        split_text=True)\n",
+    "\n",
+    "    print(\"Reading complete.\")\n",
+    "\n",
+    "    df_inventory = tables_inventory_original[0].df.copy()\n",
+    "\n",
+    "    # move broken text in correct row (page 113 is fine)\n",
+    "    if page in ['110', '111', '112']:\n",
+    "        df_inventory.at[4, 0] = \"1.A.1 - Industries énergétiques\"\n",
+    "        df_inventory = df_inventory.drop(index=3)\n",
+    "        df_inventory.at[8, 0] = \"1.A.4 - Autres secteurs\"\n",
+    "        df_inventory = df_inventory.drop(index=7)\n",
+    "\n",
+    "    # add header and unit\n",
+    "    df_header = pd.DataFrame([inv_conf[\"header\"], inv_conf[\"unit\"]])\n",
+    "    df_inventory = pd.concat([df_header, df_inventory], axis=0, join='outer').reset_index(drop=True)\n",
+    "    df_inventory = pm2.pm2io.nir_add_unit_information(df_inventory,\n",
+    "                                                  unit_row=inv_conf[\"unit_row\"],\n",
+    "                                                  entity_row=inv_conf[\"entity_row\"],\n",
+    "                                                  regexp_entity=\".*\",\n",
+    "                                                  regexp_unit=\".*\",\n",
+    "                                                  default_unit=\"Gg\")\n",
+    "\n",
+    "    print(\"Added unit information.\")\n",
+    "\n",
+    "    # set index\n",
+    "    df_inventory = df_inventory.set_index(inv_conf[\"index_cols\"])\n",
+    "\n",
+    "    # convert to long format\n",
+    "    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(df_inventory, inv_conf[\"year\"][page],\n",
+    "                                                     inv_conf[\"header_long\"])\n",
+    "\n",
+    "    # extract category from tuple\n",
+    "    df_inventory_long[\"orig_cat_name\"] = df_inventory_long[\"orig_cat_name\"].str[0]\n",
+    "\n",
+    "    # prep for conversion to PM2 IF and native format\n",
+    "    # make a copy of the categories row\n",
+    "    df_inventory_long[\"category\"] = df_inventory_long[\"orig_cat_name\"]\n",
+    "\n",
+    "    # replace cat names by codes in col \"category\"\n",
+    "    # first the manual replacements\n",
+    "    df_inventory_long[\"category\"] = \\\n",
+    "        df_inventory_long[\"category\"].replace(inv_conf[\"cat_codes_manual\"]['main'])\n",
+    "\n",
+    "    # remove the dots; regex=False states explicitly that '.' is a literal\n",
+    "    # character here and not the regex wildcard\n",
+    "    df_inventory_long[\"category\"] = df_inventory_long[\"category\"].str.replace(\".\", \"\", regex=False)\n",
+    "\n",
+    "    # then the regex replacements\n",
+    "    df_inventory_long[\"category\"] = \\\n",
+    "        df_inventory_long[\"category\"].str.replace(inv_conf[\"cat_code_regexp\"], repl,\n",
+    "                                              regex=True)\n",
+    "\n",
+    "    df_inventory_long = df_inventory_long.reset_index(drop=True)\n",
+    "\n",
+    "    # decimal comma -> decimal point, and \"NE1\" -> \"NE\" (both literal replacements)\n",
+    "    df_inventory_long[\"data\"] = df_inventory_long[\"data\"].str.replace(\",\", \".\", regex=False)\n",
+    "    df_inventory_long[\"data\"] = df_inventory_long[\"data\"].str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_inventory_long.columns = df_inventory_long.columns.map(str)\n",
+    "    df_inventory_long = df_inventory_long.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_all_dict[page] = df_inventory_long\n",
+    "\n",
+    "# combine all pages in the order in which they were read\n",
+    "df_all = pd.concat([df_all_dict[page] for page in pages],\n",
+    "                   axis=0,\n",
+    "                   join='outer').reset_index(drop=True)\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "df_all_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_all,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['main'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "f1a4535e-3abc-45d0-9309-fd7991b1cb95",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.1, CO2, 2010.\n",
+      "[422.474]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 2, SO2, 1990.\n",
+      "[0.097]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.3.a.i, N2O, 2000.\n",
+      "[6.e-05]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 2.H.2, NMVOC, 2019.\n",
+      "[2.506]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.1, CH4, 2019.\n",
+      "[0.0011]\n",
+      "Value matches expected value.\n"
+     ]
+    }
+   ],
+   "source": [
+    "### Test individual values from the tables ###\n",
+    "# TODO and note: this function is work in progress\n",
+    "# Use assert statements and print error message\n",
+    "# with category, entity, year, expected value and actual value\n",
+    "\n",
+    "def assert_individual_value(\n",
+    "    df,\n",
+    "    category_column,\n",
+    "    entity_column,\n",
+    "    category,\n",
+    "    entity,\n",
+    "    year,\n",
+    "    expected_value\n",
+    "):\n",
+    "    \"\"\"Print whether a single table value equals the expected value.\n",
+    "\n",
+    "    Rows of ``df`` are selected where ``category_column`` equals ``category``\n",
+    "    and ``entity_column`` equals ``entity``. The value in column ``year`` of\n",
+    "    the first selected row is compared with ``expected_value``. The selected\n",
+    "    values and the result of the comparison are printed; nothing is returned\n",
+    "    and no exception is raised for a mismatch.\n",
+    "    \"\"\"\n",
+    "    selected_rows = (df[category_column] == category) & (df[entity_column] == entity)\n",
+    "    arr = df.loc[selected_rows, year].values\n",
+    "    print(arr)\n",
+    "\n",
+    "    # check the number of selected values explicitly: `not arr` is also true\n",
+    "    # for a single value of 0 and is ambiguous for more than one value\n",
+    "    if arr.size == 0:\n",
+    "        print(f\"No value found for {category}, {entity}, {year}!\")\n",
+    "        # nothing to compare, arr[0] would raise an IndexError\n",
+    "        return\n",
+    "\n",
+    "    if arr.size > 1:\n",
+    "        print(f\"More than one value found for {category}, {entity}, {year}!\")\n",
+    "\n",
+    "    if arr[0] == expected_value:\n",
+    "        print(\"Value matches expected value.\")\n",
+    "    else:\n",
+    "        print(f\"Expected value {expected_value}, actual value is {arr[0]}\")\n",
+    "\n",
+    "    return\n",
+    "\n",
+    "\n",
+    "# combinations of category, entity and year together with the expected value\n",
+    "test_cases = {\n",
+    "    \"1\" : {\n",
+    "        \"category\" : \"1.A.1\",\n",
+    "        'entity' : \"CO2\",\n",
+    "        \"year\" : \"2010\",\n",
+    "        \"expected_value\" : 422.474,\n",
+    "    },\n",
+    "    \"2\" : {\n",
+    "        \"category\" : \"2\",\n",
+    "        'entity' : \"SO2\",\n",
+    "        \"year\" : \"1990\",\n",
+    "        \"expected_value\" : 0.097,\n",
+    "    },\n",
+    "    \"3\" : {\n",
+    "        \"category\" : \"1.A.3.a.i\",\n",
+    "        'entity' : \"N2O\",\n",
+    "        \"year\" : \"2000\",\n",
+    "        \"expected_value\" : 6e-5,\n",
+    "    },\n",
+    "    '4' : {\n",
+    "        \"category\" : \"2.H.2\",\n",
+    "        'entity' : \"NMVOC\",\n",
+    "        \"year\" : \"2019\",\n",
+    "        \"expected_value\" : 2.506,\n",
+    "    },\n",
+    "    '5' : {\n",
+    "        \"category\" : \"1.A.1\",\n",
+    "        'entity' : \"CH4\",\n",
+    "        \"year\" : \"2019\",\n",
+    "        \"expected_value\" : 0.0011,\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "for key, test_case in test_cases.items():\n",
+    "    print(\"-\"*50)\n",
+    "    # single quotes inside the f-string: reusing the double quote of the\n",
+    "    # enclosing f-string only parses on Python >= 3.12\n",
+    "    print(f\"Testing combination {test_case['category']}, {test_case['entity']}, {test_case['year']}.\")\n",
+    "    assert_individual_value(\n",
+    "                    df = df_all_IF,\n",
+    "                    category_column = \"category (IPCC1996_2006_GIN_Inv)\",\n",
+    "                    entity_column = \"entity\",\n",
+    "                    category = test_case[\"category\"],\n",
+    "                    entity = test_case[\"entity\"],\n",
+    "                    year = test_case[\"year\"],\n",
+    "                    expected_value = test_case[\"expected_value\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "23258414-84b2-4a99-8f48-f471f5ebf75a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------\n",
+      "Unique values in column source\n",
+      "['GIN-GHG-Inventory']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column scenario (PRIMAP)\n",
+      "['BUR1']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column provenance\n",
+      "['measured']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column area (ISO3)\n",
+      "['GIN']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column entity\n",
+      "['CH4' 'CO' 'CO2' 'HFCS (AR4GWP100)' 'N2O' 'NMVOC' 'NOx'\n",
+      " 'PFCS (AR4GWP100)' 'SF6' 'SO2']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column unit\n",
+      "['Gg CH4 / yr' 'Gg CO / yr' 'Gg CO2 / yr' 'Gg N2O / yr' 'Gg NMVOC / yr'\n",
+      " 'Gg NOx / yr' 'Gg SF6 / yr' 'Gg SO2 / yr']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column category (IPCC1996_2006_GIN_Inv)\n",
+      "['0' '1' '1.A' '1.A.1' '1.A.2' '1.A.3' '1.A.4' '1.A.5' '1.B' '1.C' '2'\n",
+      " '2.A' '2.A.1' '2.A.2' '2.A.3' '2.A.4' '2.A.5' '2.B' '2.C' '2.C.1' '2.C.2'\n",
+      " '2.C.3' '2.C.4' '2.C.5' '2.C.6' '2.C.7' '2.D' '2.D.1' '2.D.2' '2.D.3'\n",
+      " '2.D.4' '2.E' '2.F' '2.F.1' '2.F.2' '2.F.3' '2.F.4' '2.F.5' '2.F.6' '2.G'\n",
+      " '2.H' '2.H.1' '2.H.2' '2.H.3' '3' '3.A' '3.A.1' '3.A.2' '3.B' '3.B.1'\n",
+      " '3.B.2' '3.B.3' '3.B.4' '3.B.5' '3.B.6' '3.C' '3.C.1' '3.C.2' '3.C.3'\n",
+      " '3.C.4' '3.C.5' '3.C.6' '3.C.7' '3.C.8' '3.D' '3.D.1' '3.D.2' '4' '4.A'\n",
+      " '4.B' '4.C' '4.D' '4.E' '5' 'M.BK' 'M.BK.A' 'M.BK.M' 'M.MULTIOP']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column 1990\n",
+      "[ 6.5202000e+01  6.4650000e+00  3.2000000e-02  6.0000000e-03\n",
+      "  2.5000000e-02  6.4020000e+00  0.0000000e+00            nan\n",
+      "  5.6987000e+01  5.5634000e+01  5.3796000e+01  1.8381000e+00\n",
+      "  1.3530000e+00  1.7500000e+00  1.0290000e+00  2.1700000e-01\n",
+      "  5.0400000e-01  5.0000000e-06  1.6241700e+02  1.2418200e+02\n",
+      "  1.5800000e-01  2.8000000e-02  6.2990000e+00  1.1769600e+02\n",
+      "  3.8236000e+01 -1.6177575e+04  1.3104990e+03  8.1425700e+02\n",
+      "  1.5300000e+02  3.2603400e+02  1.7208000e+01  1.2779000e+01\n",
+      " -1.7502977e+04 -1.7499771e+04 -1.7508456e+04  6.9024000e+01\n",
+      " -6.0339000e+01  2.2800000e-01 -3.4340000e+00  2.1240000e+00\n",
+      "  7.1900000e-01  2.5770000e+00  1.1800000e-01  1.0000000e-03\n",
+      "  3.0000000e-02  8.0000000e-02  1.1344000e+01  2.1900000e+00\n",
+      "  1.2400000e-01  2.0660000e+00  2.6800000e-01  3.4000000e-03\n",
+      "  2.6500000e-01  2.0000000e-05  1.4312200e+01  1.3193000e+01\n",
+      "  5.2800000e-02  1.0000000e-02  1.2329000e+00  1.1897200e+01\n",
+      "  1.1192000e+00  1.0920000e+01  8.6260000e+00  2.1130000e+00\n",
+      "  3.9800000e-01  3.8580000e+00  2.2570000e+00  2.2940000e+00\n",
+      "  9.7000000e-02]\n",
+      "--------------------------------------------------\n",
+      "Unique values in column 2000\n",
+      "[ 1.1998100e+02  6.4890000e+00  2.4000000e-02  1.8000000e-02\n",
+      "  1.5000000e-01  6.2970000e+00  0.0000000e+00            nan\n",
+      "  1.1056800e+02  1.0791100e+02  1.0429800e+02  3.6134500e+00\n",
+      "  2.6570000e+00  2.9250000e+00  2.0540000e+00  2.0900000e-01\n",
+      "  6.6200000e-01  1.5000000e-05  2.5243200e+02  1.7734200e+02\n",
+      "  1.1800000e-01  8.4000000e-02  3.6605000e+01  1.4053500e+02\n",
+      "  7.5090000e+01 -1.3893667e+04  1.8410300e+03  6.0736800e+02\n",
+      "  4.6044700e+02  7.6103000e+02  1.2185000e+01  1.5640000e+01\n",
+      " -1.5752375e+04 -1.5749970e+04 -1.5766453e+04  1.6484000e+01\n",
+      "  2.6800000e-01 -2.6730000e+00  2.0380000e+00  2.1580000e+00\n",
+      "  2.1500000e-01  4.7480000e+00  1.2500000e-01  5.0000000e-03\n",
+      "  4.0000000e-03  4.9000000e-02  6.8000000e-02  4.2690000e+00\n",
+      "  2.4300000e-01  4.0260000e+00  3.5400000e-01  3.0000000e-03\n",
+      "  3.5100000e-01  6.0000000e-05  1.8179000e+01  1.6697000e+01\n",
+      "  3.9000000e-02  3.0000000e-02  6.9480000e+00  9.6800000e+00\n",
+      "  1.4820000e+00  2.5060000e+00  1.7676000e+01  1.3170000e+01\n",
+      "  1.5740000e+00  1.1960000e+00  7.9620000e+00  2.4380000e+00\n",
+      "  4.5050000e+00  1.5600000e-01]\n",
+      "--------------------------------------------------\n",
+      "Unique values in column 2010\n",
+      "[ 1.9700000e+02  4.8490000e+00  1.6000000e-02  2.8000000e-02\n",
+      "  3.1600000e-01  4.4890000e+00  0.0000000e+00            nan\n",
+      "  1.8761700e+02  1.8676900e+02  1.8045400e+02  6.3150000e+00\n",
+      "  8.4800000e-01  4.5340000e+00  3.3230000e+00  3.2000000e-01\n",
+      "  8.9200000e-01  3.0000000e-04  1.9571300e+02  1.7174700e+02\n",
+      "  8.2000000e-02  1.3300000e-01  7.7000000e+01  9.4532000e+01\n",
+      "  2.3966000e+01 -1.0691033e+04  2.3437780e+03  4.2247400e+02\n",
+      "  7.0899600e+02  1.2051170e+03  7.1920000e+00  1.9142000e+01\n",
+      " -1.3057077e+04 -1.3052876e+04 -1.3040518e+04  8.9270000e+00\n",
+      " -2.1284000e+01 -4.5210000e+00  3.1240000e+00  3.6900000e+01\n",
+      "  3.0400000e+00  7.5620000e+00  1.1900000e-01  3.0000000e-03\n",
+      "  6.0000000e-03  6.0000000e-02  5.1000000e-02  6.9670000e+00\n",
+      "  7.7000000e-02  6.8900000e+00  4.7500000e-01  5.0000000e-03\n",
+      "  4.7000000e-01  1.0000000e-03  2.7269600e+01  2.1977000e+01\n",
+      "  2.7300000e-02  4.6100000e-02  1.4539500e+01  7.3641000e+00\n",
+      "  5.2926000e+00  3.3038000e+00  1.9888000e+00  1.7748000e+01\n",
+      "  1.6310000e+01  1.0920000e+00  1.8330000e+00  1.1701000e+01\n",
+      "  1.6840000e+00  1.4380000e+00  1.0400000e-01]\n",
+      "--------------------------------------------------\n",
+      "Unique values in column 2019\n",
+      "[ 3.120340e+02  5.866000e+00  1.100000e-03  3.270000e-02  4.584000e-01\n",
+      "  5.374200e+00  0.000000e+00           nan  2.995030e+02  2.985330e+02\n",
+      "  2.882390e+02  1.029400e+01  9.700000e-01  6.665000e+00  5.170000e+00\n",
+      "  3.570000e-01  1.138000e+00  5.000000e-04  2.626700e+02  2.244300e+02\n",
+      "  5.400000e-03  1.391000e-01  1.121100e+02  1.121800e+02  3.823600e+01\n",
+      " -9.360370e+03  3.037736e+03  2.735500e+01  8.249530e+02  2.168661e+03\n",
+      "  1.676700e+01  1.114700e+02  6.984600e+01  2.097200e+01  2.065200e+01\n",
+      " -1.251307e+04 -1.251512e+04 -1.254581e+04  3.068900e+01  2.053000e+00\n",
+      "  3.491000e+00  6.619700e+01  6.141000e+00  1.211700e+01  1.770000e-01\n",
+      "  2.000000e-04  6.500000e-03  1.090000e-01  6.100000e-02  1.134400e+01\n",
+      "  8.900000e-02  1.125500e+01  5.970000e-01  6.000000e-03  5.910000e-01\n",
+      "  2.000000e-03  3.632300e+01  3.004400e+01  5.400000e-02  2.124900e+01\n",
+      "  8.739000e+00  6.279000e+00  3.773000e+00  2.506000e+00  2.822000e+01\n",
+      "  2.592000e+01  7.200000e-02  2.163000e+00  2.168000e+01  2.009000e+00\n",
+      "  2.294000e+00  3.280000e-01]\n"
+     ]
+    }
+   ],
+   "source": [
+    "### check data for errors ###\n",
+    "# print the unique values of every column to see if they look \"normal\"\n",
+    "separator = '-'*50\n",
+    "for column_name in df_all_IF.columns:\n",
+    "    print(separator)\n",
+    "    print(f\"Unique values in column {column_name}\")\n",
+    "    print(df_all_IF[column_name].unique())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "07812254-fb73-4cb5-ae45-a96a2f2273d4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-21 16:58:31.197\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78]], resulting in size 7,800.\u001b[0m\n",
+      "\u001b[32m2024-03-21 16:58:31.323\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "### convert the interchange format data to primap2 format ###\n",
+    "from_if = pm2.pm2io.from_interchange_format\n",
+    "data_pm2_main = from_if(df_all_IF)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88d4e68e-f1f4-4c7d-b710-c749296a16ca",
+   "metadata": {},
+   "source": [
+    "## 2. Read in sector tables for energy - pages 116, 117, 118, 119"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "251c3495-8506-4f43-9a97-094b5fb16947",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Reading table from page 116.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 117.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 118.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 119.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "Converting to interchange format.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>scenario (PRIMAP)</th>\n",
+       "      <th>provenance</th>\n",
+       "      <th>area (ISO3)</th>\n",
+       "      <th>entity</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>category (IPCC1996_2006_GIN_Inv)</th>\n",
+       "      <th>1990</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>2010</th>\n",
+       "      <th>2019</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6.465</td>\n",
+       "      <td>6.489</td>\n",
+       "      <td>4.849</td>\n",
+       "      <td>5.821</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A</td>\n",
+       "      <td>6.465</td>\n",
+       "      <td>6.489</td>\n",
+       "      <td>4.849</td>\n",
+       "      <td>5.821</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.1</td>\n",
+       "      <td>0.032</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.016</td>\n",
+       "      <td>0.001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.1.a</td>\n",
+       "      <td>0.032</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.016</td>\n",
+       "      <td>0.001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.1.a.i</td>\n",
+       "      <td>0.032</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.016</td>\n",
+       "      <td>0.001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>373</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>1.A.5.b.iii</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>374</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>1.A.5.c</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>375</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>1.B</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>376</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>M.BK.M</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>377</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>M.MULTIOP</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>378 rows × 11 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                source scenario (PRIMAP) provenance area (ISO3) entity  \\\n",
+       "0    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "1    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "2    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "3    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "4    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "..                 ...               ...        ...         ...    ...   \n",
+       "373  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "374  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "375  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "376  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "377  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "\n",
+       "            unit category (IPCC1996_2006_GIN_Inv)   1990   2000   2010   2019  \n",
+       "0    Gg CH4 / yr                                1  6.465  6.489  4.849  5.821  \n",
+       "1    Gg CH4 / yr                              1.A  6.465  6.489  4.849  5.821  \n",
+       "2    Gg CH4 / yr                            1.A.1  0.032  0.024  0.016  0.001  \n",
+       "3    Gg CH4 / yr                          1.A.1.a  0.032  0.024  0.016  0.001  \n",
+       "4    Gg CH4 / yr                        1.A.1.a.i  0.032  0.024  0.016  0.001  \n",
+       "..           ...                              ...    ...    ...    ...    ...  \n",
+       "373  Gg SO2 / yr                      1.A.5.b.iii    NaN    NaN    NaN    NaN  \n",
+       "374  Gg SO2 / yr                          1.A.5.c    NaN    NaN    NaN    NaN  \n",
+       "375  Gg SO2 / yr                              1.B    NaN    NaN    NaN    NaN  \n",
+       "376  Gg SO2 / yr                           M.BK.M    NaN    NaN    NaN    NaN  \n",
+       "377  Gg SO2 / yr                        M.MULTIOP    NaN    NaN    NaN    NaN  \n",
+       "\n",
+       "[378 rows x 11 columns]"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Read the energy tables page by page, bring each one into long format and\n",
+    "# convert the combined result to the primap2 interchange format.\n",
+    "# The year belonging to each page is looked up in inv_conf[\"year\"].\n",
+    "pages = ['116', '117', '118', '119']\n",
+    "\n",
+    "# The two settings below do not depend on the page, so they are set once\n",
+    "# before the loop instead of on every iteration.\n",
+    "# TODO: move to config section\n",
+    "# replace individual categories\n",
+    "inv_conf[\"cat_codes_manual\"]['energy'] = {\n",
+    "    'International Bunkers': 'MEMO',\n",
+    "    '1.A.3.a.i - Aviation internationale (soutes internationales)': 'M.BK.A',\n",
+    "    '1.A.3.d.i - Navigation internationale (soutes internationales)' : 'M.BK.M',\n",
+    "    '1.A.5.c - Opérations multilatérales' : 'M.MULTIOP',\n",
+    "}\n",
+    "# regular expression capturing the leading category code as group 'code'\n",
+    "# (also read by the LULUCF cell further down)\n",
+    "inv_conf[\"cat_code_regexp\"] = r'^(?P<code>[a-zA-Z0-9\\.]{1,11})[\\s\\.].*'\n",
+    "\n",
+    "df_energy_dict = {}\n",
+    "for page in pages:\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Reading table from page {page}.\")\n",
+    "\n",
+    "    tables_inventory_original = camelot.read_pdf(\n",
+    "        str(input_folder / pdf_file),\n",
+    "        pages=page,\n",
+    "        flavor=\"lattice\",\n",
+    "        split_text=True\n",
+    "        )\n",
+    "\n",
+    "    print(\"Reading complete.\")\n",
+    "\n",
+    "    # skip the first rows of both tables and cut the last two lines of the second table\n",
+    "    # to ignore additional information regarding biomass for energy production\n",
+    "    df_energy_year = pd.concat([tables_inventory_original[0].df[2:],\n",
+    "                                tables_inventory_original[1].df[3:-2]],\n",
+    "                                axis=0,\n",
+    "                                join='outer').reset_index(drop=True)\n",
+    "\n",
+    "    # drop duplicate lines - 1.A.3.d.i / 1.A.3.a.i / 1.A.5.c\n",
+    "    # TODO: better to find the index of the line and then drop it by the index\n",
+    "    df_energy_year = df_energy_year.drop(index=[27, 32, 50])\n",
+    "\n",
+    "    # add header and unit\n",
+    "    df_header = pd.DataFrame([inv_conf[\"header_energy\"], inv_conf[\"unit_energy\"]])\n",
+    "\n",
+    "    df_energy_year = pd.concat([df_header, df_energy_year], axis=0, join='outer').reset_index(drop=True)\n",
+    "\n",
+    "    df_energy_year = pm2.pm2io.nir_add_unit_information(df_energy_year,\n",
+    "                                                  unit_row=inv_conf[\"unit_row\"],\n",
+    "                                                  entity_row=inv_conf[\"entity_row\"],\n",
+    "                                                  regexp_entity=\".*\",\n",
+    "                                                  regexp_unit=\".*\",\n",
+    "                                                  default_unit=\"Gg\")\n",
+    "\n",
+    "    print(\"Added unit information.\")\n",
+    "    # set index\n",
+    "    df_energy_year = df_energy_year.set_index(inv_conf[\"index_cols\"])\n",
+    "\n",
+    "    # convert to long format\n",
+    "    df_energy_year_long = pm2.pm2io.nir_convert_df_to_long(df_energy_year, inv_conf[\"year\"][page],\n",
+    "                                                     inv_conf[\"header_long\"])\n",
+    "\n",
+    "    # extract from tuple\n",
+    "    df_energy_year_long[\"orig_cat_name\"] = df_energy_year_long[\"orig_cat_name\"].str[0]\n",
+    "\n",
+    "    # prep for conversion to PM2 IF and native format\n",
+    "    # make a copy of the categories row\n",
+    "    df_energy_year_long[\"category\"] = df_energy_year_long[\"orig_cat_name\"]\n",
+    "\n",
+    "    # replace cat names by codes in col \"category\"\n",
+    "    # first the manual replacements\n",
+    "    # (regex=False: all patterns here are literal text; passing it explicitly keeps the\n",
+    "    # result independent of the pandas version, whose default changed in 2.0)\n",
+    "    df_energy_year_long[\"category\"] = df_energy_year_long[\"category\"].str.replace('\\n', '', regex=False)\n",
+    "    df_energy_year_long[\"category\"] = \\\n",
+    "        df_energy_year_long[\"category\"].replace(inv_conf[\"cat_codes_manual\"]['energy'])\n",
+    "\n",
+    "    # remove the dots literally - as a regular expression \".\" would match any character\n",
+    "    df_energy_year_long[\"category\"] = df_energy_year_long[\"category\"].str.replace(\".\", \"\", regex=False)\n",
+    "\n",
+    "    # then the regex replacements\n",
+    "    repl = lambda m: m.group('code')\n",
+    "    df_energy_year_long[\"category\"] = \\\n",
+    "        df_energy_year_long[\"category\"].str.replace(inv_conf[\"cat_code_regexp\"], repl,\n",
+    "                                              regex=True)\n",
+    "\n",
+    "    df_energy_year_long = df_energy_year_long.reset_index(drop=True)\n",
+    "\n",
+    "    # use \".\" as decimal separator and strip the trailing \"1\" from \"NE1\"\n",
+    "    df_energy_year_long[\"data\"] = df_energy_year_long[\"data\"].str.replace(\",\", \".\", regex=False)\n",
+    "    df_energy_year_long[\"data\"] = df_energy_year_long[\"data\"].str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_energy_year_long.columns = df_energy_year_long.columns.map(str)\n",
+    "    df_energy_year_long = df_energy_year_long.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_energy_dict[page] = df_energy_year_long\n",
+    "\n",
+    "# combine all pages in the order given by `pages`\n",
+    "df_energy = pd.concat([df_energy_dict[p] for p in pages],\n",
+    "                      axis=0,\n",
+    "                      join='outer').reset_index(drop=True)\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "df_energy_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_energy,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['energy'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )\n",
+    "\n",
+    "df_energy_IF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "64fa29dc-f62b-4010-bfed-8cd588675475",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.2.k, CH4, 1990.\n",
+      "[3.e-05]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.4.c.i, CO, 1990.\n",
+      "[0.0016]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.3.a.i, NMVOC, 2000.\n",
+      "[0.0002]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1, SO2, 2010.\n",
+      "[0.]\n",
+      "No value found for 1, SO2, 2010!\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.2.k, N2O, 2019.\n",
+      "[7.e-06]\n",
+      "Value matches expected value.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Spot checks: for a few category / entity / year combinations the value read\n",
+    "# into df_energy_IF is compared with a manually chosen expected value.\n",
+    "test_cases = {\n",
+    "    \"1\" : {\n",
+    "        \"category\" : \"1.A.2.k\",\n",
+    "        'entity' : \"CH4\",\n",
+    "        \"year\" : \"1990\",\n",
+    "        \"expected_value\" : 3e-05,\n",
+    "    },\n",
+    "    \"2\" : {\n",
+    "        \"category\" : \"1.A.4.c.i\",\n",
+    "        'entity' : \"CO\",\n",
+    "        \"year\" : \"1990\",\n",
+    "        \"expected_value\" : 0.0016,\n",
+    "    },\n",
+    "    \"3\" : {\n",
+    "        \"category\" : \"1.A.3.a.i\",\n",
+    "        'entity' : \"NMVOC\",\n",
+    "        \"year\" : \"2000\",\n",
+    "        \"expected_value\" : 0.0002,\n",
+    "    },\n",
+    "    '4' : {\n",
+    "        \"category\" : \"1\",\n",
+    "        'entity' : \"SO2\",\n",
+    "        \"year\" : \"2010\",\n",
+    "        \"expected_value\" : 0,\n",
+    "    },\n",
+    "    '5' : {\n",
+    "        \"category\" : \"1.A.2.k\",\n",
+    "        'entity' : \"N2O\",\n",
+    "        \"year\" : \"2019\",\n",
+    "        \"expected_value\" : 7e-06,\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "for case in test_cases.values():\n",
+    "    print(\"-\"*50)\n",
+    "    # single quotes inside the f-string: reusing the enclosing double quotes\n",
+    "    # is only valid syntax from Python 3.12 on\n",
+    "    print(f\"Testing combination {case['category']}, {case['entity']}, {case['year']}.\")\n",
+    "    assert_individual_value(\n",
+    "                    df = df_energy_IF,\n",
+    "                    category_column = \"category (IPCC1996_2006_GIN_Inv)\",\n",
+    "                    entity_column = \"entity\",\n",
+    "                    category = case[\"category\"],\n",
+    "                    entity = case[\"entity\"],\n",
+    "                    year = case[\"year\"],\n",
+    "                    expected_value = case[\"expected_value\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "bcf727f7-3474-4f2e-9bcb-ebdd140a14c1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-21 17:25:29.863\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54]], resulting in size 2,646.\u001b[0m\n",
+      "\u001b[32m2024-03-21 17:25:29.940\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "### convert to primap2 format ###\n",
+    "# energy data: interchange format (df_energy_IF) -> primap2 data structure\n",
+    "data_pm2_energy = pm2.pm2io.from_interchange_format(df_energy_IF)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d557a318-ea3f-44ec-9187-c05da423fbca",
+   "metadata": {},
+   "source": [
+    "# 3. Read in LULUCF table - pages 124, 125, 126, 127"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "b4d117f0-6bfc-468f-b9f2-f66d5eaf8f1a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Reading table from page 124.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 125.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 126.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 127.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "Converting to interchange format.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>scenario (PRIMAP)</th>\n",
+       "      <th>provenance</th>\n",
+       "      <th>area (ISO3)</th>\n",
+       "      <th>entity</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>category (IPCC1996_2006_GIN_Inv)</th>\n",
+       "      <th>1990</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>2010</th>\n",
+       "      <th>2019</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3</td>\n",
+       "      <td>56.987</td>\n",
+       "      <td>110.568</td>\n",
+       "      <td>187.617</td>\n",
+       "      <td>299.503</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3.A</td>\n",
+       "      <td>55.634</td>\n",
+       "      <td>107.911</td>\n",
+       "      <td>186.769</td>\n",
+       "      <td>298.533</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3.A.1</td>\n",
+       "      <td>53.796</td>\n",
+       "      <td>104.298</td>\n",
+       "      <td>180.454</td>\n",
+       "      <td>288.239</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3.A.1.a</td>\n",
+       "      <td>49.050</td>\n",
+       "      <td>94.967</td>\n",
+       "      <td>161.753</td>\n",
+       "      <td>256.319</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3.A.1.a.i</td>\n",
+       "      <td>10.488</td>\n",
+       "      <td>17.802</td>\n",
+       "      <td>27.091</td>\n",
+       "      <td>31.905</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>469</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.C.7</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>470</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.C.8</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>471</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.D</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>472</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.D.1</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>473</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.D.2</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>474 rows × 11 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                source scenario (PRIMAP) provenance area (ISO3) entity  \\\n",
+       "0    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "1    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "2    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "3    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "4    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "..                 ...               ...        ...         ...    ...   \n",
+       "469  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "470  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "471  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "472  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "473  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "\n",
+       "            unit category (IPCC1996_2006_GIN_Inv)    1990     2000     2010  \\\n",
+       "0    Gg CH4 / yr                                3  56.987  110.568  187.617   \n",
+       "1    Gg CH4 / yr                              3.A  55.634  107.911  186.769   \n",
+       "2    Gg CH4 / yr                            3.A.1  53.796  104.298  180.454   \n",
+       "3    Gg CH4 / yr                          3.A.1.a  49.050   94.967  161.753   \n",
+       "4    Gg CH4 / yr                        3.A.1.a.i  10.488   17.802   27.091   \n",
+       "..           ...                              ...     ...      ...      ...   \n",
+       "469  Gg NOx / yr                            3.C.7   0.000    0.000    0.000   \n",
+       "470  Gg NOx / yr                            3.C.8   0.000    0.000    0.000   \n",
+       "471  Gg NOx / yr                              3.D   0.000    0.000    0.000   \n",
+       "472  Gg NOx / yr                            3.D.1   0.000    0.000    0.000   \n",
+       "473  Gg NOx / yr                            3.D.2   0.000    0.000    0.000   \n",
+       "\n",
+       "        2019  \n",
+       "0    299.503  \n",
+       "1    298.533  \n",
+       "2    288.239  \n",
+       "3    256.319  \n",
+       "4     31.905  \n",
+       "..       ...  \n",
+       "469    0.000  \n",
+       "470    0.000  \n",
+       "471    0.000  \n",
+       "472    0.000  \n",
+       "473    0.000  \n",
+       "\n",
+       "[474 rows x 11 columns]"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Read the tables on pages 124-127 page by page, bring each one into long format\n",
+    "# and convert the combined result to the primap2 interchange format.\n",
+    "# The year belonging to each page is looked up in inv_conf[\"year\"].\n",
+    "# NOTE: inv_conf[\"cat_code_regexp\"] is set in the energy cell above, which has to run first.\n",
+    "pages = ['124', '125', '126', '127']\n",
+    "\n",
+    "# rename duplicate categories in tables\n",
+    "# names are written to consecutive rows, starting at a page dependent row label\n",
+    "# TODO move to config section\n",
+    "replace_names_common = [\n",
+    "    \"3.A.2.a.i - Vaches laitières\",\n",
+    "    \"3.A.2.a.ii - Autres bovins\",\n",
+    "    \"3.A.2.b - Buffle\",\n",
+    "    \"3.A.2.c - Ovins\",\n",
+    "    \"3.A.2.d - Caprins\",\n",
+    "    \"3.A.2.e - Chameaux\",\n",
+    "    \"3.A.2.f - Chevaux\",\n",
+    "    \"3.A.2.g - Mules et ânes\",\n",
+    "    \"3.A.2.h - Porcins\",\n",
+    "    \"3.A.2.i - Volailles\",\n",
+    "]\n",
+    "\n",
+    "df_lulucf_dict = {}\n",
+    "for page in pages:\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Reading table from page {page}.\")\n",
+    "\n",
+    "    tables_inventory_original = camelot.read_pdf(\n",
+    "        str(input_folder / pdf_file),\n",
+    "        pages=page,\n",
+    "        flavor=\"lattice\",\n",
+    "        split_text=True\n",
+    "        )\n",
+    "    print(\"Reading complete.\")\n",
+    "\n",
+    "    if page == '127':\n",
+    "        # table on page 127 has one extra row at the top\n",
+    "        # and one extra category 3.A.1.j\n",
+    "        first_data_row = 3\n",
+    "        first_replace_row = 19\n",
+    "        replace_names = replace_names_common + [\"3.A.2.j - Autres (préciser)\"]\n",
+    "    else:\n",
+    "        # cut first two lines\n",
+    "        first_data_row = 2\n",
+    "        first_replace_row = 17\n",
+    "        replace_names = replace_names_common\n",
+    "\n",
+    "    # copy the slice so the renaming below works on an independent data frame\n",
+    "    # and not on a slice of the table returned by camelot\n",
+    "    df_lulucf_year = tables_inventory_original[0].df[first_data_row:].copy()\n",
+    "\n",
+    "    # the slice keeps its original row labels, so rows are addressed by label\n",
+    "    for index, category_name in enumerate(replace_names, start=first_replace_row):\n",
+    "        df_lulucf_year.at[index, 0] = category_name\n",
+    "\n",
+    "    # add header and unit\n",
+    "    df_header = pd.DataFrame([inv_conf[\"header_lulucf\"], inv_conf[\"unit_lulucf\"]])\n",
+    "\n",
+    "    df_lulucf_year = pd.concat([df_header, df_lulucf_year], axis=0, join='outer').reset_index(drop=True)\n",
+    "\n",
+    "    df_lulucf_year = pm2.pm2io.nir_add_unit_information(df_lulucf_year,\n",
+    "                                                  unit_row=inv_conf[\"unit_row\"],\n",
+    "                                                  entity_row=inv_conf[\"entity_row\"],\n",
+    "                                                  regexp_entity=\".*\",\n",
+    "                                                  regexp_unit=\".*\",\n",
+    "                                                  default_unit=\"Gg\")\n",
+    "\n",
+    "    print(\"Added unit information.\")\n",
+    "\n",
+    "    # set index\n",
+    "    df_lulucf_year = df_lulucf_year.set_index(inv_conf[\"index_cols\"])\n",
+    "\n",
+    "    # convert to long format\n",
+    "    df_lulucf_year_long = pm2.pm2io.nir_convert_df_to_long(df_lulucf_year, inv_conf[\"year\"][page],\n",
+    "                                                     inv_conf[\"header_long\"])\n",
+    "\n",
+    "    df_lulucf_year_long[\"orig_cat_name\"] = df_lulucf_year_long[\"orig_cat_name\"].str[0] # extract from tuple\n",
+    "\n",
+    "    # prep for conversion to PM2 IF and native format\n",
+    "    # make a copy of the categories row\n",
+    "    df_lulucf_year_long[\"category\"] = df_lulucf_year_long[\"orig_cat_name\"]\n",
+    "\n",
+    "    # regex replacements\n",
+    "    repl = lambda m: m.group('code')\n",
+    "    df_lulucf_year_long[\"category\"] = \\\n",
+    "        df_lulucf_year_long[\"category\"].str.replace(inv_conf[\"cat_code_regexp\"], repl,\n",
+    "                                              regex=True)\n",
+    "\n",
+    "    df_lulucf_year_long = df_lulucf_year_long.reset_index(drop=True)\n",
+    "\n",
+    "    # use \".\" as decimal separator and strip the trailing \"1\" from \"NE1\"\n",
+    "    # (regex=False: both patterns are literal text)\n",
+    "    df_lulucf_year_long[\"data\"] = df_lulucf_year_long[\"data\"].str.replace(\",\", \".\", regex=False)\n",
+    "    df_lulucf_year_long[\"data\"] = df_lulucf_year_long[\"data\"].str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_lulucf_year_long.columns = df_lulucf_year_long.columns.map(str)\n",
+    "    df_lulucf_year_long = df_lulucf_year_long.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_lulucf_dict[page] = df_lulucf_year_long\n",
+    "\n",
+    "# combine all pages in the order given by `pages`\n",
+    "df_lulucf = pd.concat([df_lulucf_dict[p] for p in pages],\n",
+    "                      axis=0,\n",
+    "                      join='outer').reset_index(drop=True)\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "df_lulucf_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_lulucf,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['lulucf'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )\n",
+    "\n",
+    "df_lulucf_IF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "8d132ea2-655a-4363-9171-b81904a7d6d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-22 09:22:15.333\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79]], resulting in size 2,844.\u001b[0m\n",
+      "\u001b[32m2024-03-22 09:22:15.408\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "### convert the LULUCF interchange-format data (df_lulucf_IF) to primap2 format ###\n",
+    "data_pm2_lulucf = pm2.pm2io.from_interchange_format(df_lulucf_IF)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a99c689e-1f26-42d5-8974-194373ce26f6",
+   "metadata": {},
+   "source": [
+    "# 3. Read in Waste tables - pages 128, 130"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "fcf17dba-6af4-400f-9ec3-b5dd5b1b0a82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Waste tables: page 128 holds three tables (one per year) and page 130\n",
+    "# holds one more table. Both pages are read with the same camelot options.\n",
+    "waste_pdf_path = str(input_folder / pdf_file)\n",
+    "waste_read_opts = dict(flavor=\"lattice\", split_text=True)\n",
+    "\n",
+    "# page 128: three tables\n",
+    "page = '128'\n",
+    "tables_inventory_original_128 = camelot.read_pdf(waste_pdf_path, pages=page, **waste_read_opts)\n",
+    "\n",
+    "# page 130: last table\n",
+    "page = '130'\n",
+    "tables_inventory_original_130 = camelot.read_pdf(waste_pdf_path, pages=page, **waste_read_opts)\n",
+    "\n",
+    "# raw DataFrames keyed by inventory year\n",
+    "df_waste_years = dict(\n",
+    "    zip(\n",
+    "        ['1990', '2000', '2010', '2019'],\n",
+    "        [\n",
+    "            tables_inventory_original_128[0].df,\n",
+    "            tables_inventory_original_128[1].df,\n",
+    "            tables_inventory_original_128[2].df,\n",
+    "            tables_inventory_original_130[0].df,\n",
+    "        ],\n",
+    "    )\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "4e0afb6e-db8b-41ae-b02d-e4a5d54ea5ed",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Processing table for 1990.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Processing table for 2000.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Processing table for 2010.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Processing table for 2019.\n",
+      "Added unit information.\n",
+      "Converting to interchange format.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>scenario (PRIMAP)</th>\n",
+       "      <th>provenance</th>\n",
+       "      <th>area (ISO3)</th>\n",
+       "      <th>entity</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>category (IPCC1996_2006_GIN_Inv)</th>\n",
+       "      <th>1990</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>2010</th>\n",
+       "      <th>2019</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4</td>\n",
+       "      <td>1.750</td>\n",
+       "      <td>2.925</td>\n",
+       "      <td>4.534</td>\n",
+       "      <td>6.665</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4.A</td>\n",
+       "      <td>1.029</td>\n",
+       "      <td>2.054</td>\n",
+       "      <td>3.323</td>\n",
+       "      <td>5.170</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4.A.1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4.A.2</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4.A.3</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>86</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.C.2</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>87</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.D</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>88</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.D.1</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>89</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.D.2</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>90</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.E</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>91 rows × 11 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               source scenario (PRIMAP) provenance area (ISO3) entity  \\\n",
+       "0   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "1   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "2   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "3   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "4   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "..                ...               ...        ...         ...    ...   \n",
+       "86  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "87  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "88  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "89  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "90  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "\n",
+       "           unit category (IPCC1996_2006_GIN_Inv)   1990   2000   2010   2019  \n",
+       "0   Gg CH4 / yr                                4  1.750  2.925  4.534  6.665  \n",
+       "1   Gg CH4 / yr                              4.A  1.029  2.054  3.323  5.170  \n",
+       "2   Gg CH4 / yr                            4.A.1    NaN    NaN    NaN    NaN  \n",
+       "3   Gg CH4 / yr                            4.A.2    NaN    NaN    NaN    NaN  \n",
+       "4   Gg CH4 / yr                            4.A.3    NaN    NaN    NaN    NaN  \n",
+       "..          ...                              ...    ...    ...    ...    ...  \n",
+       "86  Gg SO2 / yr                            4.C.2  0.000  0.000  0.000  0.000  \n",
+       "87  Gg SO2 / yr                              4.D  0.000  0.000  0.000  0.000  \n",
+       "88  Gg SO2 / yr                            4.D.1  0.000  0.000  0.000  0.000  \n",
+       "89  Gg SO2 / yr                            4.D.2  0.000  0.000  0.000  0.000  \n",
+       "90  Gg SO2 / yr                              4.E  0.000  0.000  0.000  0.000  \n",
+       "\n",
+       "[91 rows x 11 columns]"
+      ]
+     },
+     "execution_count": 57,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Turn the four waste tables collected in df_waste_years into one long-format\n",
+    "# frame and convert it to the primap2 interchange format.\n",
+    "# Names taken from earlier cells: df_waste_years, inv_conf, coords_cols,\n",
+    "# coords_defaults, coords_terminologies, coords_value_mapping, filter_remove,\n",
+    "# meta_data.\n",
+    "\n",
+    "\n",
+    "def repl(m):\n",
+    "    \"\"\"Return the text captured by the named group 'code' of regex match m.\"\"\"\n",
+    "    return m.group('code')\n",
+    "\n",
+    "\n",
+    "df_waste_dict = {}\n",
+    "for year, df_waste_raw in df_waste_years.items():\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Processing table for {year}.\")\n",
+    "\n",
+    "    # drop the first two rows of the extracted table; the configured header\n",
+    "    # and unit rows are prepended instead\n",
+    "    df_waste_year = df_waste_raw[2:]\n",
+    "\n",
+    "    # add header and unit\n",
+    "    df_header = pd.DataFrame([inv_conf[\"header_waste\"], inv_conf[\"unit_waste\"]])\n",
+    "\n",
+    "    df_waste_year = pd.concat([df_header, df_waste_year], axis=0, join='outer').reset_index(drop=True)\n",
+    "\n",
+    "    df_waste_year = pm2.pm2io.nir_add_unit_information(df_waste_year,\n",
+    "                                                  unit_row=inv_conf[\"unit_row\"],\n",
+    "                                                  entity_row=inv_conf[\"entity_row\"],\n",
+    "                                                  regexp_entity=\".*\",\n",
+    "                                                  regexp_unit=\".*\",\n",
+    "                                                  default_unit=\"Gg\")\n",
+    "\n",
+    "    print(\"Added unit information.\")\n",
+    "\n",
+    "    # set index\n",
+    "    df_waste_year = df_waste_year.set_index(inv_conf[\"index_cols\"])\n",
+    "\n",
+    "    # convert to long format\n",
+    "    df_waste_year_long = pm2.pm2io.nir_convert_df_to_long(df_waste_year, year,\n",
+    "                                                     inv_conf[\"header_long\"])\n",
+    "\n",
+    "    # keep only the first element of each orig_cat_name entry\n",
+    "    df_waste_year_long[\"orig_cat_name\"] = df_waste_year_long[\"orig_cat_name\"].str[0]\n",
+    "\n",
+    "    # prep for conversion to PM2 IF and native format\n",
+    "    # make a copy of the categories row\n",
+    "    df_waste_year_long[\"category\"] = df_waste_year_long[\"orig_cat_name\"]\n",
+    "\n",
+    "    # regex replacement: reduce each category name to its code\n",
+    "    df_waste_year_long[\"category\"] = \\\n",
+    "        df_waste_year_long[\"category\"].str.replace(inv_conf[\"cat_code_regexp\"], repl,\n",
+    "                                              regex=True)\n",
+    "\n",
+    "    df_waste_year_long = df_waste_year_long.reset_index(drop=True)\n",
+    "\n",
+    "    # literal replacements: regex=False is passed explicitly because '.' is a\n",
+    "    # regex wildcard and the default of `regex` differs between pandas versions\n",
+    "    df_waste_year_long[\"category\"] = df_waste_year_long[\"category\"].str.replace(\".\", \"\", regex=False)\n",
+    "    # presumably decimal commas -> decimal points (TODO confirm against the PDF)\n",
+    "    df_waste_year_long[\"data\"] = df_waste_year_long[\"data\"].str.replace(\",\", \".\", regex=False)\n",
+    "    # presumably 'NE' followed by a footnote marker (TODO confirm against the PDF)\n",
+    "    df_waste_year_long[\"data\"] = df_waste_year_long[\"data\"].str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_waste_year_long.columns = df_waste_year_long.columns.map(str)\n",
+    "    df_waste_year_long = df_waste_year_long.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_waste_dict[year] = df_waste_year_long\n",
+    "\n",
+    "# stack the per-year frames in the insertion order of df_waste_years\n",
+    "df_waste = pd.concat(list(df_waste_dict.values()),\n",
+    "                      axis=0,\n",
+    "                      join='outer').reset_index(drop=True)\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "df_waste_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_waste,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['waste'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )\n",
+    "\n",
+    "df_waste_IF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "6628eacb-8a24-415b-a42e-04e929976f83",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-22 09:27:11.859\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13]], resulting in size 637.\u001b[0m\n",
+      "\u001b[32m2024-03-22 09:27:11.898\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "### convert the waste interchange-format data (df_waste_IF) to primap2 format ###\n",
+    "data_pm2_waste = pm2.pm2io.from_interchange_format(df_waste_IF)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba512153-1c65-4568-9bae-817fbf9cc9b3",
+   "metadata": {},
+   "source": [
+    "# 4. Read in trend tables - pages 131 - 137"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "0e71c7b2-c301-4048-8b92-c9fc58a2501f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Reading table for page 131 and entity CO2.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 132 and entity CH4.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 133 and entity N2O.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 134 and entity NOx.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 135 and entity CO.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 136 and entity NMVOCs.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 137 and entity SO2.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "Converting to interchange format.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>scenario (PRIMAP)</th>\n",
+       "      <th>provenance</th>\n",
+       "      <th>area (ISO3)</th>\n",
+       "      <th>entity</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>category (IPCC1996_2006_GIN_Inv)</th>\n",
+       "      <th>1990</th>\n",
+       "      <th>1995</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>2005</th>\n",
+       "      <th>2010</th>\n",
+       "      <th>2015</th>\n",
+       "      <th>2018</th>\n",
+       "      <th>2019</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>0</td>\n",
+       "      <td>65.202</td>\n",
+       "      <td>93.368</td>\n",
+       "      <td>119.981</td>\n",
+       "      <td>152.272</td>\n",
+       "      <td>196.057</td>\n",
+       "      <td>253.025</td>\n",
+       "      <td>296.416</td>\n",
+       "      <td>312.034</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6.465</td>\n",
+       "      <td>7.066</td>\n",
+       "      <td>6.489</td>\n",
+       "      <td>5.984</td>\n",
+       "      <td>4.849</td>\n",
+       "      <td>5.360</td>\n",
+       "      <td>5.931</td>\n",
+       "      <td>5.866</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A</td>\n",
+       "      <td>6.465</td>\n",
+       "      <td>7.066</td>\n",
+       "      <td>6.489</td>\n",
+       "      <td>5.984</td>\n",
+       "      <td>4.849</td>\n",
+       "      <td>5.360</td>\n",
+       "      <td>5.931</td>\n",
+       "      <td>5.866</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.1</td>\n",
+       "      <td>0.032</td>\n",
+       "      <td>0.027</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.020</td>\n",
+       "      <td>0.016</td>\n",
+       "      <td>0.002</td>\n",
+       "      <td>0.005</td>\n",
+       "      <td>0.001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.2</td>\n",
+       "      <td>0.006</td>\n",
+       "      <td>0.012</td>\n",
+       "      <td>0.018</td>\n",
+       "      <td>0.023</td>\n",
+       "      <td>0.028</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.026</td>\n",
+       "      <td>0.033</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>151</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>5</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>152</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>M.BK</td>\n",
+       "      <td>0.719</td>\n",
+       "      <td>1.438</td>\n",
+       "      <td>2.158</td>\n",
+       "      <td>19.529</td>\n",
+       "      <td>36.900</td>\n",
+       "      <td>21.840</td>\n",
+       "      <td>51.718</td>\n",
+       "      <td>66.197</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>153</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>M.BK.A</td>\n",
+       "      <td>0.719</td>\n",
+       "      <td>1.438</td>\n",
+       "      <td>2.158</td>\n",
+       "      <td>19.529</td>\n",
+       "      <td>36.900</td>\n",
+       "      <td>21.840</td>\n",
+       "      <td>51.718</td>\n",
+       "      <td>66.197</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>154</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>M.BK.M</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>155</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>M.MULTIOP</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>156 rows × 15 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                source scenario (PRIMAP) provenance area (ISO3) entity  \\\n",
+       "0    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "1    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "2    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "3    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "4    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "..                 ...               ...        ...         ...    ...   \n",
+       "151  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "152  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "153  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "154  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "155  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "\n",
+       "            unit category (IPCC1996_2006_GIN_Inv)    1990    1995     2000  \\\n",
+       "0    Gg CH4 / yr                                0  65.202  93.368  119.981   \n",
+       "1    Gg CH4 / yr                                1   6.465   7.066    6.489   \n",
+       "2    Gg CH4 / yr                              1.A   6.465   7.066    6.489   \n",
+       "3    Gg CH4 / yr                            1.A.1   0.032   0.027    0.024   \n",
+       "4    Gg CH4 / yr                            1.A.2   0.006   0.012    0.018   \n",
+       "..           ...                              ...     ...     ...      ...   \n",
+       "151  Gg CO2 / yr                                5     NaN     NaN      NaN   \n",
+       "152  Gg CO2 / yr                             M.BK   0.719   1.438    2.158   \n",
+       "153  Gg CO2 / yr                           M.BK.A   0.719   1.438    2.158   \n",
+       "154  Gg CO2 / yr                           M.BK.M     NaN     NaN      NaN   \n",
+       "155  Gg CO2 / yr                        M.MULTIOP   0.000   0.000    0.000   \n",
+       "\n",
+       "        2005     2010     2015     2018     2019  \n",
+       "0    152.272  196.057  253.025  296.416  312.034  \n",
+       "1      5.984    4.849    5.360    5.931    5.866  \n",
+       "2      5.984    4.849    5.360    5.931    5.866  \n",
+       "3      0.020    0.016    0.002    0.005    0.001  \n",
+       "4      0.023    0.028    0.024    0.026    0.033  \n",
+       "..       ...      ...      ...      ...      ...  \n",
+       "151      NaN      NaN      NaN      NaN      NaN  \n",
+       "152   19.529   36.900   21.840   51.718   66.197  \n",
+       "153   19.529   36.900   21.840   51.718   66.197  \n",
+       "154      NaN      NaN      NaN      NaN      NaN  \n",
+       "155    0.000    0.000    0.000    0.000    0.000  \n",
+       "\n",
+       "[156 rows x 15 columns]"
+      ]
+     },
+     "execution_count": 67,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Read the trend tables (one entity per page), reshape each to long format\n",
+    "# and convert the combined frame to the primap2 interchange format.\n",
+    "\n",
+    "#%matplotlib widget \n",
+    "#camelot.plot(tables_inventory_original[0], kind='text')\n",
+    "\n",
+    "df_main_dict = {}\n",
+    "pages = ['131', '132', '133', '134', '135', '136', '137']\n",
+    "entities = ['CO2', 'CH4', 'N2O', 'NOx', 'CO', 'NMVOCs', 'SO2']\n",
+    "\n",
+    "# 'data' prefix is needed for pd.wide_to_long() later\n",
+    "columns_years = ['data1990', 'data1995', \"data2000\", 'data2005', 'data2010', 'data2015', 'data2018', 'data2019']\n",
+    "\n",
+    "# pattern that extracts the category code; identical for every page, so it is set once\n",
+    "inv_conf[\"cat_code_regexp\"] = r'^(?P<code>[a-zA-Z0-9\\.]{1,11})[\\s\\.].*'\n",
+    "repl = lambda m: m.group('code')\n",
+    "\n",
+    "# manual replacements for category names that carry no usable code\n",
+    "cat_codes_manual_trend = {\n",
+    "    'Total des émissions et absorptions nationales': \"0\",\n",
+    "    '2A5: Autre' : '2A5',\n",
+    "    'Éléments pour mémoire': 'MEMO',\n",
+    "    'Soutes internationales' : 'M.BK',\n",
+    "    '1.A.3.a.i - Aviation internationale (soutes internationales)' : 'M.BK.A',\n",
+    "    '1.A.3.d.i - Navigation internationale (soutes internationales)' : 'M.BK.M',\n",
+    "    '1.A.5.c - Opérations multilatérales' : 'M.MULTIOP',\n",
+    "}\n",
+    "\n",
+    "# for this set of tables every page is a different entity\n",
+    "for page, entity in zip(pages, entities):\n",
+    "\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Reading table for page {page} and entity {entity}.\")\n",
+    "\n",
+    "    # first table needs to be read in with flavor=\"stream\"\n",
+    "    # flavor=\"lattice\" raises an error, maybe camelot issue\n",
+    "    # see https://github.com/atlanhq/camelot/issues/306\n",
+    "    # or because characters in first row almost reach\n",
+    "    # the table grid\n",
+    "    if page == '131':\n",
+    "        tables_inventory_original = camelot.read_pdf(\n",
+    "            str(input_folder / pdf_file),\n",
+    "            pages=page,\n",
+    "            table_areas=page_def_templates[page][\"area\"],\n",
+    "            columns=page_def_templates[page][\"cols\"],\n",
+    "            flavor=\"stream\",\n",
+    "            split_text=True\n",
+    "        )\n",
+    "        # skip the first row; .copy() gives an independent frame so the\n",
+    "        # column assignments below do not act on a slice of camelot's table\n",
+    "        df_trend_entity = tables_inventory_original[0].df[1:].copy()\n",
+    "    else:\n",
+    "        tables_inventory_original = camelot.read_pdf(\n",
+    "            str(input_folder / pdf_file),\n",
+    "            pages=page,\n",
+    "            flavor=\"lattice\",\n",
+    "            split_text=True)\n",
+    "        # skip the first three rows; .copy() for the same reason as above\n",
+    "        df_trend_entity = tables_inventory_original[0].df[3:].copy()\n",
+    "\n",
+    "    print(\"Reading complete.\")\n",
+    "\n",
+    "    # add columns\n",
+    "    df_trend_entity.columns = ['orig_cat_name'] + columns_years\n",
+    "\n",
+    "    # unit is always Gg\n",
+    "    df_trend_entity['unit'] = 'Gg'\n",
+    "\n",
+    "    # only one entity per table\n",
+    "    df_trend_entity['entity'] = entity\n",
+    "\n",
+    "    df_trend_entity[\"category\"] = df_trend_entity[\"orig_cat_name\"]\n",
+    "\n",
+    "    # delete the first row with an empty category\n",
+    "    # in the first table there is no empty line\n",
+    "    if page != '131':\n",
+    "        row_to_delete = df_trend_entity.index[df_trend_entity['category'] == ''][0]\n",
+    "        df_trend_entity = df_trend_entity.drop(index = row_to_delete)\n",
+    "\n",
+    "    # first the manual replacements ...\n",
+    "    df_trend_entity[\"category\"] = df_trend_entity[\"category\"].replace(cat_codes_manual_trend)\n",
+    "\n",
+    "    # ... then strip dots and line breaks; regex=False because these are literal\n",
+    "    # characters ('.' as a regular expression would match every character)\n",
+    "    df_trend_entity[\"category\"] = df_trend_entity[\"category\"].str.replace(\".\", \"\", regex=False)\n",
+    "    df_trend_entity[\"category\"] = df_trend_entity[\"category\"].str.replace(\"\\n\", \"\", regex=False)\n",
+    "\n",
+    "    # ... and finally keep only the leading category code\n",
+    "    df_trend_entity[\"category\"] = \\\n",
+    "        df_trend_entity[\"category\"].str.replace(inv_conf[\"cat_code_regexp\"], repl,\n",
+    "                                              regex=True)\n",
+    "\n",
+    "    df_trend_entity = df_trend_entity.reset_index(drop=True)\n",
+    "\n",
+    "    print(\"Created category codes.\")\n",
+    "\n",
+    "    # decimal comma -> decimal point, and 'NE1' is replaced by 'NE'\n",
+    "    for year in columns_years:\n",
+    "        df_trend_entity[year] = df_trend_entity[year].str.replace(\",\", \".\", regex=False)\n",
+    "        df_trend_entity[year] = df_trend_entity[year].str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_trend_entity.columns = df_trend_entity.columns.map(str)\n",
+    "\n",
+    "    df_trend_entity = df_trend_entity.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_trend_entity_long = pd.wide_to_long(df_trend_entity, stubnames='data',  i='category', j='time')\n",
+    "\n",
+    "    print(\"Converted to long format.\")\n",
+    "\n",
+    "    df_trend_entity_long = df_trend_entity_long.reset_index()\n",
+    "\n",
+    "    df_main_dict[page] =  df_trend_entity_long\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "\n",
+    "# NOTE: only pages 131 (CO2) and 132 (CH4) are combined here although all seven\n",
+    "# pages are read above - TODO confirm whether the remaining pages should be added\n",
+    "df_trend_all = pd.concat([df_main_dict['131'], df_main_dict['132']], axis=0, join='outer').reset_index(drop=True)\n",
+    "\n",
+    "df_trend_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_trend_all,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['trend'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )\n",
+    "\n",
+    "df_trend_IF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "05e1ad4f-c35c-460c-8546-5e493f363739",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-22 09:52:43.765\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 2, 78], [1, 1, 1, 1, 2, 78]], resulting in size 312.\u001b[0m\n",
+      "\u001b[32m2024-03-22 09:52:43.826\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "### convert to primap2 format ###\n",
+    "data_pm2_trend = pm2.pm2io.from_interchange_format(df_trend_IF)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3b65227-b7c4-4d18-89ef-af927c9a81b5",
+   "metadata": {},
+   "source": [
+    "# Combine tables and save to IF and native format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "960117b6-28fc-45ba-a768-16f63e428875",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-22 10:09:36.801\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for CH4\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.026\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for CO2\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.187\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for N2O\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.351\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for SO2\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.448\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for NMVOC\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.533\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge_with_tolerance_core\u001b[0m:\u001b[36m74\u001b[0m - \u001b[31m\u001b[1mpr.merge error: found discrepancies larger than tolerance (11.00%) for source=GIN-GHG-Inventory, scenario (PRIMAP)=BUR1, provenance=measured, area (ISO3)=GIN, category (IPCC1996_2006_GIN_Inv)=1.A.2:\n",
+      "shown are relative discrepancies.\n",
+      "               NMVOC\n",
+      "time                \n",
+      "1990-01-01  0.800000\n",
+      "2000-01-01  0.800000\n",
+      "2010-01-01  0.869848\u001b[0m\n"
+     ]
+    },
+    {
+     "ename": "MergeError",
+     "evalue": "pr.merge error: found discrepancies larger than tolerance (11.00%) for source=GIN-GHG-Inventory, scenario (PRIMAP)=BUR1, provenance=measured, area (ISO3)=GIN, category (IPCC1996_2006_GIN_Inv)=1.A.2:\nshown are relative discrepancies.\n               NMVOC\ntime                \n1990-01-01  0.800000\n2000-01-01  0.800000\n2010-01-01  0.869848",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mMergeError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[72], line 10\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m#### combine\u001b[39;00m\n\u001b[1;32m      2\u001b[0m \n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m#data_pm2_main\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m      8\u001b[0m \n\u001b[1;32m      9\u001b[0m \u001b[38;5;66;03m# tolerance needs to be high as rounding in trend tables leads to inconsistent data\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m data_pm2 \u001b[38;5;241m=\u001b[39m \u001b[43mdata_pm2_main\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmerge\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_pm2_energy\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.11\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/Documents/UNFCCC_non-AnnexI_data/venv/lib/python3.12/site-packages/primap2/_merge.py:231\u001b[0m, in \u001b[0;36mDatasetMergeAccessor.merge\u001b[0;34m(self, ds_merge, tolerance, error_on_discrepancy, combine_attrs)\u001b[0m\n\u001b[1;32m    229\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m var \u001b[38;5;129;01min\u001b[39;00m vars_common:\n\u001b[1;32m    230\u001b[0m     logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmerging for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvar\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 231\u001b[0m     ds_result_new \u001b[38;5;241m=\u001b[39m \u001b[43mmerge_with_tolerance_core\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    232\u001b[0m \u001b[43m        \u001b[49m\u001b[43mda_start\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mds_start\u001b[49m\u001b[43m[\u001b[49m\u001b[43mvar\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    233\u001b[0m \u001b[43m        \u001b[49m\u001b[43mda_merge\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mds_merge\u001b[49m\u001b[43m[\u001b[49m\u001b[43mvar\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    234\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtolerance\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    235\u001b[0m \u001b[43m        \u001b[49m\u001b[43merror_on_discrepancy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merror_on_discrepancy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    237\u001b[0m     ds_result \u001b[38;5;241m=\u001b[39m xr\u001b[38;5;241m.\u001b[39mmerge([ds_result, ds_result_new], combine_attrs\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverride\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    238\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m ds_result\n",
+      "File \u001b[0;32m~/Documents/UNFCCC_non-AnnexI_data/venv/lib/python3.12/site-packages/primap2/_merge.py:75\u001b[0m, in \u001b[0;36mmerge_with_tolerance_core\u001b[0;34m(da_start, da_merge, tolerance, error_on_discrepancy)\u001b[0m\n\u001b[1;32m     73\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_on_discrepancy:\n\u001b[1;32m     74\u001b[0m     logger\u001b[38;5;241m.\u001b[39merror(log_message)\n\u001b[0;32m---> 75\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m xr\u001b[38;5;241m.\u001b[39mMergeError(log_message)\n\u001b[1;32m     76\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m     77\u001b[0m     \u001b[38;5;66;03m# log warning, continue with merging\u001b[39;00m\n\u001b[1;32m     78\u001b[0m     logger\u001b[38;5;241m.\u001b[39mwarning(log_message)\n",
+      "\u001b[0;31mMergeError\u001b[0m: pr.merge error: found discrepancies larger than tolerance (11.00%) for source=GIN-GHG-Inventory, scenario (PRIMAP)=BUR1, provenance=measured, area (ISO3)=GIN, category (IPCC1996_2006_GIN_Inv)=1.A.2:\nshown are relative discrepancies.\n               NMVOC\ntime                \n1990-01-01  0.800000\n2000-01-01  0.800000\n2010-01-01  0.869848"
+     ]
+    }
+   ],
+   "source": [
+    "#### combine\n",
+    "\n",
+    "#data_pm2_main\n",
+    "#data_pm2_trend\n",
+    "#data_pm2_energy\n",
+    "#data_pm2_lulucf\n",
+    "#data_pm2_waste\n",
+    "\n",
+    "# tolerance needs to be high as rounding in trend tables leads to inconsistent data\n",
+    "data_pm2 = data_pm2_main.pr.merge(data_pm2_energy,tolerance=0.11)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3cb74c9e-b400-454b-848a-28091b832016",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ###\n",
+    "# save data to IF and native format\n",
+    "# ###\n",
+    "\n",
+    "# convert back to IF to have units in the fixed format\n",
+    "data_if = data_pm2.pr.to_interchange_format()\n",
+    "\n",
+    "# file stem shared by the interchange-format and the netCDF output\n",
+    "raw_file_stem = output_filename + coords_terminologies[\"category\"] + \"_raw\"\n",
+    "pm2.pm2io.write_interchange_format(output_folder / raw_file_stem, data_if)\n",
+    "\n",
+    "# every data variable gets the same compression settings\n",
+    "encoding = dict.fromkeys(data_pm2.data_vars, compression)\n",
+    "data_pm2.pr.to_netcdf(output_folder / (raw_file_stem + \".nc\"), encoding=encoding)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 2570 - 0
UNFCCC_GHG_data/UNFCCC_reader/Guinea/Guinea_BUR1_test_v3.ipynb

@@ -0,0 +1,2570 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a8f3f028-ef62-4014-b911-7a61d24e3dae",
+   "metadata": {},
+   "source": [
+    "### ToDos\n",
+    "- check if unit row length is correct"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "461e34a0-47b1-44a7-ba1a-77db66ea783a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set environment variable (only for jupyter notebook)\n",
+    "# A root path that is already configured in the environment is kept;\n",
+    "# the machine-specific path below is only a fallback when nothing is set.\n",
+    "import os\n",
+    "if \"UNFCCC_GHG_ROOT_PATH\" not in os.environ:\n",
+    "    os.environ[\"UNFCCC_GHG_ROOT_PATH\"] = \"/Users/danielbusch/Documents/UNFCCC_non-AnnexI_data\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "83dd87db-4956-4bb1-937a-84629bfce95b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import camelot\n",
+    "import primap2 as pm2\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from pathlib import Path\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c37d6d49-076c-4823-a486-83fbda3fa33f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ###\n",
+    "# configuration\n",
+    "# ###\n",
+    "\n",
+    "# folder holding the downloaded BUR1 pdf and folder for the extracted data\n",
+    "input_folder = downloaded_data_path / 'UNFCCC' / 'Guinea' / 'BUR1'\n",
+    "output_folder = extracted_data_path / 'UNFCCC' / 'Guinea'\n",
+    "# parents=True also creates a missing 'UNFCCC' folder,\n",
+    "# exist_ok=True makes the cell safe to re-run\n",
+    "output_folder.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "pdf_file = \"Rapport_IGES-Guinee-BUR1_VF.pdf\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "87bf46ce-441e-4247-b62a-ce5ebcf26cb8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# primap2 format conversion\n",
+    "# primap2 format conversion\n",
+    "\n",
+    "# coordinate name -> column name, passed as coords_cols to the conversion\n",
+    "coords_cols = dict(category=\"category\", entity=\"entity\", unit=\"unit\")\n",
+    "\n",
+    "# fixed coordinate values, passed as coords_defaults to the conversion\n",
+    "coords_defaults = dict(\n",
+    "    source=\"GIN-GHG-Inventory\",\n",
+    "    provenance=\"measured\",\n",
+    "    area=\"GIN\",\n",
+    "    scenario=\"BUR1\",\n",
+    ")\n",
+    "\n",
+    "# terminology names, passed as coords_terminologies to the conversion\n",
+    "coords_terminologies = dict(\n",
+    "    area=\"ISO3\",\n",
+    "    # TODO check if this is correct\n",
+    "    category=\"IPCC1996_2006_GIN_Inv\",\n",
+    "    scenario=\"PRIMAP\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "23676d59-d7e9-455c-b713-7ce98b92d5d7",
+   "metadata": {},
+   "source": [
+    "### Q: How to choose gwp_to_use?\n",
+    "### Q: 'unit' and 'category' are 'PRIMAP1'. Are there other options?\n",
+    "### Q: Why are we mapping 'NMVOCs': 'NMVOC', wouldn't it be easier to name it NMVOC in the first place?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "953ddab6-07ee-4b60-82f0-f2e9ca76b1a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# GWP name used in the entity labels of the HFCs, PFCs and SF6 columns\n",
+    "# (how to choose it is still an open question)\n",
+    "gwp_to_use = \"AR4GWP100\"\n",
+    "\n",
+    "# value mappings per table type; each entry is passed as coords_value_mapping\n",
+    "# to the interchange-format conversion of the corresponding tables\n",
+    "coords_value_mapping = {\n",
+    "    'main': {\n",
+    "        \"unit\": \"PRIMAP1\",\n",
+    "        \"category\": \"PRIMAP1\",\n",
+    "        \"entity\": {\n",
+    "            'HFCs': f\"HFCS ({gwp_to_use})\",\n",
+    "            'PFCs': f\"PFCS ({gwp_to_use})\",\n",
+    "            'SF6' : f\"SF6 ({gwp_to_use})\",\n",
+    "            'NMVOCs': 'NMVOC',\n",
+    "        },\n",
+    "    },\n",
+    "    # the sector tables share one layout: only NMVOCs needs to be renamed\n",
+    "    **{\n",
+    "        table: {\n",
+    "            \"unit\": \"PRIMAP1\",\n",
+    "            \"category\": \"PRIMAP1\",\n",
+    "            \"entity\": {'NMVOCs': 'NMVOC'},\n",
+    "        }\n",
+    "        for table in ('energy', 'lulucf', 'waste')\n",
+    "    },\n",
+    "    # no entity mapping for the trend tables\n",
+    "    'trend': {\n",
+    "        \"unit\": \"PRIMAP1\",\n",
+    "        \"category\": \"PRIMAP1\",\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "# passed as filter_remove to the conversion; presumably drops the rows\n",
+    "# whose category is 'MEMO' - TODO confirm\n",
+    "filter_remove = {\n",
+    "    'f_memo': {\"category\": \"MEMO\"},\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ef888811-5803-4df7-8fd8-06830e6d9bce",
+   "metadata": {},
+   "source": [
+    "### Q: What to put under references and rights?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "23b39c1a-700c-46f9-a3f5-33549658ad69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# meta data passed to the interchange-format conversion\n",
+    "# TODO: \"references\" and \"rights\" are still placeholders\n",
+    "meta_data = {\n",
+    "    \"references\": \"placeholder\",\n",
+    "    \"rights\": \"\",\n",
+    "    \"contact\": \"mail@johannes-guetschow.de\",\n",
+    "    \"title\": \"Guinea. Biennial update report (BUR). BUR1\",\n",
+    "    \"comment\": \"Read from pdf by Daniel Busch\",\n",
+    "    \"institution\": \"UNFCCC\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "2390fb91-d976-47f9-9236-a6c838e1fd56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "page_def_templates = {\n",
+    "    '110': {\n",
+    "        \"area\": ['36,718,589,87'],\n",
+    "        \"cols\": ['290,340,368,392,425,445,465,497,535,564'],\n",
+    "    },\n",
+    "    '111': {\n",
+    "        \"area\": ['36,736,587,107'],\n",
+    "        \"cols\": ['293,335,369,399,424,445,468,497,535,565'],\n",
+    "    },\n",
+    "    '112': {\n",
+    "        \"area\": ['35,733,588,106'],\n",
+    "        \"cols\": ['293,335,369,399,424,445,468,497,535,565'],\n",
+    "    },\n",
+    "    '113': {\n",
+    "        \"area\": ['35,733,588,106'],\n",
+    "        \"cols\": ['293,335,365,399,424,445,468,497,535,565'],\n",
+    "    },\n",
+    "    '131' : {\n",
+    "                \"area\": ['36,718,590,83'],\n",
+    "                \"cols\": ['293,332,370,406,442,480,516,554'],\n",
+    "            },\n",
+    "}\n",
+    "\n",
+    "# for main table\n",
+    "header_inventory = ['Greenhouse gas source and sink categories',\n",
+    "                   'CO2', 'CH4', \"N2O\", 'HFCs', 'PFCs', 'SF6', 'NOx', 'CO', 'NMVOCs','SO2'\n",
+    "                   ]\n",
+    "# TODO the extra '-' may be wrong here, check again!\n",
+    "unit_inventory = ['-'] + ['Gg'] * len(header_inventory) # one extra for the category columns\n",
+    "unit_inventory[4] = \"GgCO2eq\"\n",
+    "unit_inventory[5] = \"GgCO2eq\"\n",
+    "unit_inventory[6] = \"GgCO2eq\"\n",
+    "\n",
+    "# for energy tables\n",
+    "header_energy = ['Greenhouse gas source and sink categories',\n",
+    "                   'CO2', 'CH4', \"N2O\", 'NOx', 'CO', 'NMVOCs','SO2'\n",
+    "                   ]\n",
+    "unit_energy = ['-'] + ['Gg'] * len(header_energy) # one extra for the category columns\n",
+    "\n",
+    "# for lulucf tables\n",
+    "header_lulucf = ['Greenhouse gas source and sink categories', 'CO2', 'CH4', \"N2O\", 'NOx', 'CO', 'NMVOCs']\n",
+    "unit_lulucf = ['-'] + ['Gg'] * (len(header_lulucf) - 1)\n",
+    "\n",
+    "# for waste table\n",
+    "header_waste = ['Greenhouse gas source and sink categories', 'CO2', 'CH4', \"N2O\", 'NOx', 'CO', 'NMVOCs', 'SO2']\n",
+    "unit_waste = ['-'] + ['Gg'] * (len(header_waste) - 1)\n",
+    "\n",
+    "# for trend table (unit is always Gg for this table)\n",
+    "header_trend = ['data1990', 'data1995', \"data2000\", 'data2005', 'data2010', 'data2015', 'data2018', 'data2019']\n",
+    "\n",
+    "\n",
+    "# define config dict\n",
+    "inv_conf = {\n",
+    "    'header': header_inventory,\n",
+    "    'unit': unit_inventory,\n",
+    "    'header_energy' : header_energy,\n",
+    "    'unit_energy' : unit_energy,\n",
+    "    'header_lulucf' : header_lulucf,\n",
+    "    'unit_lulucf' : unit_lulucf,\n",
+    "    'header_waste' : header_waste,\n",
+    "    'unit_waste' : unit_waste,\n",
+    "    'header_trend' : header_trend,\n",
+    "    'entity_row': 0,\n",
+    "    'unit_row': 1,\n",
+    "    'index_cols': \"Greenhouse gas source and sink categories\",\n",
+    "    'year': {'110' : 1990,\n",
+    "             '111' : 2000,\n",
+    "             '112' : 2010,\n",
+    "             '113' : 2019,\n",
+    "             '116' : 1990,\n",
+    "             '117' : 2000,\n",
+    "             '118' : 2010,\n",
+    "             '119' : 2019,\n",
+    "             '124' : 1990,\n",
+    "             '125' : 2000,\n",
+    "             '126' : 2010,\n",
+    "             '127' : 2019,\n",
+    "            },\n",
+    "    'header_long': [\"orig_cat_name\", \"entity\", \"unit\", \"time\", \"data\"],\n",
+    "    \"cat_code_regexp\" : r'^(?P<code>[a-zA-Z0-9\\.]{1,11})[\\s\\.].*'\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd0b97f8-acbb-4df1-9764-b2d0f6af39ba",
+   "metadata": {},
+   "source": [
+    "## 1. Read main tables - pages 110, 111, 112, 113"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "4357ddd0-e9ee-4b2b-a765-c36411df63e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Reading table from page 110.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 111.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 112.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 113.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "Converting to interchange format.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Read the main inventory tables (one page per year), convert each page to\n",
+    "# long format with category codes and combine all pages into one frame that\n",
+    "# is converted to the primap2 interchange format.\n",
+    "pages = ['110', '111', '112', '113']\n",
+    "df_all_dict = {}\n",
+    "\n",
+    "# manual replacements for category names that carry no usable code\n",
+    "# TODO: move this to config section\n",
+    "# setdefault creates the container first: inv_conf has no \"cat_codes_manual\"\n",
+    "# key on a fresh kernel, so plain indexing raised a KeyError\n",
+    "inv_conf.setdefault(\"cat_codes_manual\", {})['main'] = {\n",
+    "    'Éléments pour mémoire': 'MEMO',\n",
+    "    'Soutes internationales': 'M.BK',\n",
+    "    '1.A.3.a.i - Aviation internationale (soutes internationales)': 'M.BK.A',\n",
+    "    '1.A.3.d.i - Navigation internationale (soutes internationales)' : 'M.BK.M',\n",
+    "    '1.A.5.c - Opérations multilatérales' : 'M.MULTIOP',\n",
+    "    'Total des émissions et absorptions nationales': \"0\",\n",
+    "    '2A5: Autre': '2A5',\n",
+    "}\n",
+    "\n",
+    "# keeps only the code captured by inv_conf[\"cat_code_regexp\"]\n",
+    "repl = lambda m: m.group('code')\n",
+    "\n",
+    "for page in pages:\n",
+    "\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Reading table from page {page}.\")\n",
+    "\n",
+    "    tables_inventory_original = camelot.read_pdf(\n",
+    "        str(input_folder / pdf_file),\n",
+    "        pages=page,\n",
+    "        table_areas=page_def_templates[page][\"area\"],\n",
+    "        columns=page_def_templates[page][\"cols\"],\n",
+    "        flavor=\"stream\",\n",
+    "        split_text=True)\n",
+    "\n",
+    "    print(\"Reading complete.\")\n",
+    "\n",
+    "    df_inventory = tables_inventory_original[0].df.copy()\n",
+    "\n",
+    "    # move broken text in correct row (page 113 is fine)\n",
+    "    if page in ['110', '111', '112']:\n",
+    "        df_inventory.at[4, 0] = \"1.A.1 - Industries énergétiques\"\n",
+    "        df_inventory = df_inventory.drop(index=3)\n",
+    "        df_inventory.at[8, 0] = \"1.A.4 - Autres secteurs\"\n",
+    "        df_inventory = df_inventory.drop(index=7)\n",
+    "\n",
+    "    # add header and unit\n",
+    "    df_header = pd.DataFrame([inv_conf[\"header\"], inv_conf[\"unit\"]])\n",
+    "    df_inventory = pd.concat([df_header, df_inventory], axis=0, join='outer').reset_index(drop=True)\n",
+    "    df_inventory = pm2.pm2io.nir_add_unit_information(df_inventory,\n",
+    "                                                  unit_row=inv_conf[\"unit_row\"],\n",
+    "                                                  entity_row=inv_conf[\"entity_row\"],\n",
+    "                                                  regexp_entity=\".*\",\n",
+    "                                                  regexp_unit=\".*\",\n",
+    "                                                  default_unit=\"Gg\")\n",
+    "\n",
+    "    print(\"Added unit information.\")\n",
+    "\n",
+    "    # set index\n",
+    "    df_inventory = df_inventory.set_index(inv_conf[\"index_cols\"])\n",
+    "\n",
+    "    # convert to long format\n",
+    "    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(df_inventory, inv_conf[\"year\"][page],\n",
+    "                                                     inv_conf[\"header_long\"])\n",
+    "\n",
+    "    # extract category from tuple\n",
+    "    df_inventory_long[\"orig_cat_name\"] = df_inventory_long[\"orig_cat_name\"].str[0]\n",
+    "\n",
+    "    # prep for conversion to PM2 IF and native format\n",
+    "    # make a copy of the categories row\n",
+    "    df_inventory_long[\"category\"] = df_inventory_long[\"orig_cat_name\"]\n",
+    "\n",
+    "    # replace cat names by codes in col \"category\"\n",
+    "    # first the manual replacements\n",
+    "    df_inventory_long[\"category\"] = \\\n",
+    "        df_inventory_long[\"category\"].replace(inv_conf[\"cat_codes_manual\"]['main'])\n",
+    "\n",
+    "    # strip the dots; regex=False because '.' is meant literally here\n",
+    "    # (as a regular expression it would match every character)\n",
+    "    df_inventory_long[\"category\"] = df_inventory_long[\"category\"].str.replace(\".\", \"\", regex=False)\n",
+    "\n",
+    "    # then the regex replacements\n",
+    "    df_inventory_long[\"category\"] = \\\n",
+    "        df_inventory_long[\"category\"].str.replace(inv_conf[\"cat_code_regexp\"], repl,\n",
+    "                                              regex=True)\n",
+    "\n",
+    "    df_inventory_long = df_inventory_long.reset_index(drop=True)\n",
+    "\n",
+    "    # decimal comma -> decimal point, and 'NE1' is replaced by 'NE'\n",
+    "    df_inventory_long[\"data\"] = df_inventory_long[\"data\"].str.replace(\",\", \".\", regex=False)\n",
+    "    df_inventory_long[\"data\"] = df_inventory_long[\"data\"].str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_inventory_long.columns = df_inventory_long.columns.map(str)\n",
+    "    df_inventory_long = df_inventory_long.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_all_dict[page] = df_inventory_long\n",
+    "\n",
+    "# combine the pages in reading order\n",
+    "df_all = pd.concat([df_all_dict[page] for page in pages],\n",
+    "                      axis=0,\n",
+    "                      join='outer').reset_index(drop=True)\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "df_all_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_all,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['main'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "f1a4535e-3abc-45d0-9309-fd7991b1cb95",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.1, CO2, 2010.\n",
+      "[422.474]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 2, SO2, 1990.\n",
+      "[0.097]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.3.a.i, N2O, 2000.\n",
+      "[6.e-05]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 2.H.2, NMVOC, 2019.\n",
+      "[2.506]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.1, CH4, 2019.\n",
+      "[0.0011]\n",
+      "Value matches expected value.\n"
+     ]
+    }
+   ],
+   "source": [
+    "### Test individual values from the tables ###\n",
+    "# TODO and note: this function is work in progress\n",
+    "# Use assert statements and print error message\n",
+    "# with category, entity, year, expected value and actual value\n",
+    "\n",
+    "### Test individual values from the tables ###\n",
+    "def assert_individual_value(\n",
+    "    df,\n",
+    "    category_column,\n",
+    "    entity_column,\n",
+    "    category,\n",
+    "    entity,\n",
+    "    year,\n",
+    "    expected_value\n",
+    "):\n",
+    "    \"\"\"Print whether one value of ``df`` equals ``expected_value``.\n",
+    "\n",
+    "    The rows where ``category_column`` equals ``category`` and\n",
+    "    ``entity_column`` equals ``entity`` are selected, and the entry in\n",
+    "    column ``year`` of the first selected row is compared with\n",
+    "    ``expected_value``. The outcome is only printed: the function\n",
+    "    returns None and does not raise on a mismatch.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    df : DataFrame holding the values to check\n",
+    "    category_column : name of the column with the category codes\n",
+    "    entity_column : name of the column with the entities\n",
+    "    category : category code to select\n",
+    "    entity : entity to select\n",
+    "    year : label of the column holding the value\n",
+    "    expected_value : value the selected entry is compared with\n",
+    "    \"\"\"\n",
+    "    arr = df.loc[(df[category_column] == category) & (df[entity_column] == entity), year].values\n",
+    "    print(arr)\n",
+    "\n",
+    "    # Check the number of selected rows instead of the truthiness of the\n",
+    "    # array: `not arr` is also True for a single value of 0, and indexing\n",
+    "    # an empty selection with arr[0] would raise an IndexError.\n",
+    "    if arr.size == 0:\n",
+    "        print((f\"No value found for {category}, {entity}, {year}!\"))\n",
+    "        return\n",
+    "\n",
+    "    if len(arr) > 1:\n",
+    "        print(f\"More than one value found for {category}, {entity}, {year}!\")\n",
+    "\n",
+    "    # only the first selected value is compared\n",
+    "    if arr[0] == expected_value:\n",
+    "        print(\"Value matches expected value.\")\n",
+    "    else:\n",
+    "        print(f\"Expected value {expected_value}, actual value is {arr[0]}\")\n",
+    "\n",
+    "    return\n",
+    "\n",
+    "\n",
+    "# spot checks of individual entries of df_all_IF:\n",
+    "# category / entity / year (column label) and the value expected there\n",
+    "test_cases = {\n",
+    "    \"1\" : {\n",
+    "        \"category\" : \"1.A.1\",\n",
+    "        \"entity\" : \"CO2\",\n",
+    "        \"year\" : \"2010\",\n",
+    "        \"expected_value\" : 422.474,\n",
+    "    },\n",
+    "    \"2\" : {\n",
+    "        \"category\" : \"2\",\n",
+    "        \"entity\" : \"SO2\",\n",
+    "        \"year\" : \"1990\",\n",
+    "        \"expected_value\" : 0.097,\n",
+    "    },\n",
+    "    \"3\" : {\n",
+    "        \"category\" : \"1.A.3.a.i\",\n",
+    "        \"entity\" : \"N2O\",\n",
+    "        \"year\" : \"2000\",\n",
+    "        \"expected_value\" : 6e-5,\n",
+    "    },\n",
+    "    \"4\" : {\n",
+    "        \"category\" : \"2.H.2\",\n",
+    "        \"entity\" : \"NMVOC\",\n",
+    "        \"year\" : \"2019\",\n",
+    "        \"expected_value\" : 2.506,\n",
+    "    },\n",
+    "    \"5\" : {\n",
+    "        \"category\" : \"1.A.1\",\n",
+    "        \"entity\" : \"CH4\",\n",
+    "        \"year\" : \"2019\",\n",
+    "        \"expected_value\" : 0.0011,\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "for key, case in test_cases.items():\n",
+    "    print(\"-\"*50)\n",
+    "    # single quotes inside the f-string: re-using the enclosing double\n",
+    "    # quotes is a SyntaxError before Python 3.12\n",
+    "    print(f\"Testing combination {case['category']}, {case['entity']}, {case['year']}.\")\n",
+    "    assert_individual_value(\n",
+    "        df = df_all_IF,\n",
+    "        category_column = \"category (IPCC1996_2006_GIN_Inv)\",\n",
+    "        entity_column = \"entity\",\n",
+    "        category = case[\"category\"],\n",
+    "        entity = case[\"entity\"],\n",
+    "        year = case[\"year\"],\n",
+    "        expected_value = case[\"expected_value\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "23258414-84b2-4a99-8f48-f471f5ebf75a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------\n",
+      "Unique values in column source\n",
+      "['GIN-GHG-Inventory']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column scenario (PRIMAP)\n",
+      "['BUR1']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column provenance\n",
+      "['measured']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column area (ISO3)\n",
+      "['GIN']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column entity\n",
+      "['CH4' 'CO' 'CO2' 'HFCS (AR4GWP100)' 'N2O' 'NMVOC' 'NOx'\n",
+      " 'PFCS (AR4GWP100)' 'SF6' 'SO2']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column unit\n",
+      "['Gg CH4 / yr' 'Gg CO / yr' 'Gg CO2 / yr' 'Gg N2O / yr' 'Gg NMVOC / yr'\n",
+      " 'Gg NOx / yr' 'Gg SF6 / yr' 'Gg SO2 / yr']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column category (IPCC1996_2006_GIN_Inv)\n",
+      "['0' '1' '1.A' '1.A.1' '1.A.2' '1.A.3' '1.A.4' '1.A.5' '1.B' '1.C' '2'\n",
+      " '2.A' '2.A.1' '2.A.2' '2.A.3' '2.A.4' '2.A.5' '2.B' '2.C' '2.C.1' '2.C.2'\n",
+      " '2.C.3' '2.C.4' '2.C.5' '2.C.6' '2.C.7' '2.D' '2.D.1' '2.D.2' '2.D.3'\n",
+      " '2.D.4' '2.E' '2.F' '2.F.1' '2.F.2' '2.F.3' '2.F.4' '2.F.5' '2.F.6' '2.G'\n",
+      " '2.H' '2.H.1' '2.H.2' '2.H.3' '3' '3.A' '3.A.1' '3.A.2' '3.B' '3.B.1'\n",
+      " '3.B.2' '3.B.3' '3.B.4' '3.B.5' '3.B.6' '3.C' '3.C.1' '3.C.2' '3.C.3'\n",
+      " '3.C.4' '3.C.5' '3.C.6' '3.C.7' '3.C.8' '3.D' '3.D.1' '3.D.2' '4' '4.A'\n",
+      " '4.B' '4.C' '4.D' '4.E' '5' 'M.BK' 'M.BK.A' 'M.BK.M' 'M.MULTIOP']\n",
+      "--------------------------------------------------\n",
+      "Unique values in column 1990\n",
+      "[ 6.5202000e+01  6.4650000e+00  3.2000000e-02  6.0000000e-03\n",
+      "  2.5000000e-02  6.4020000e+00  0.0000000e+00            nan\n",
+      "  5.6987000e+01  5.5634000e+01  5.3796000e+01  1.8381000e+00\n",
+      "  1.3530000e+00  1.7500000e+00  1.0290000e+00  2.1700000e-01\n",
+      "  5.0400000e-01  5.0000000e-06  1.6241700e+02  1.2418200e+02\n",
+      "  1.5800000e-01  2.8000000e-02  6.2990000e+00  1.1769600e+02\n",
+      "  3.8236000e+01 -1.6177575e+04  1.3104990e+03  8.1425700e+02\n",
+      "  1.5300000e+02  3.2603400e+02  1.7208000e+01  1.2779000e+01\n",
+      " -1.7502977e+04 -1.7499771e+04 -1.7508456e+04  6.9024000e+01\n",
+      " -6.0339000e+01  2.2800000e-01 -3.4340000e+00  2.1240000e+00\n",
+      "  7.1900000e-01  2.5770000e+00  1.1800000e-01  1.0000000e-03\n",
+      "  3.0000000e-02  8.0000000e-02  1.1344000e+01  2.1900000e+00\n",
+      "  1.2400000e-01  2.0660000e+00  2.6800000e-01  3.4000000e-03\n",
+      "  2.6500000e-01  2.0000000e-05  1.4312200e+01  1.3193000e+01\n",
+      "  5.2800000e-02  1.0000000e-02  1.2329000e+00  1.1897200e+01\n",
+      "  1.1192000e+00  1.0920000e+01  8.6260000e+00  2.1130000e+00\n",
+      "  3.9800000e-01  3.8580000e+00  2.2570000e+00  2.2940000e+00\n",
+      "  9.7000000e-02]\n",
+      "--------------------------------------------------\n",
+      "Unique values in column 2000\n",
+      "[ 1.1998100e+02  6.4890000e+00  2.4000000e-02  1.8000000e-02\n",
+      "  1.5000000e-01  6.2970000e+00  0.0000000e+00            nan\n",
+      "  1.1056800e+02  1.0791100e+02  1.0429800e+02  3.6134500e+00\n",
+      "  2.6570000e+00  2.9250000e+00  2.0540000e+00  2.0900000e-01\n",
+      "  6.6200000e-01  1.5000000e-05  2.5243200e+02  1.7734200e+02\n",
+      "  1.1800000e-01  8.4000000e-02  3.6605000e+01  1.4053500e+02\n",
+      "  7.5090000e+01 -1.3893667e+04  1.8410300e+03  6.0736800e+02\n",
+      "  4.6044700e+02  7.6103000e+02  1.2185000e+01  1.5640000e+01\n",
+      " -1.5752375e+04 -1.5749970e+04 -1.5766453e+04  1.6484000e+01\n",
+      "  2.6800000e-01 -2.6730000e+00  2.0380000e+00  2.1580000e+00\n",
+      "  2.1500000e-01  4.7480000e+00  1.2500000e-01  5.0000000e-03\n",
+      "  4.0000000e-03  4.9000000e-02  6.8000000e-02  4.2690000e+00\n",
+      "  2.4300000e-01  4.0260000e+00  3.5400000e-01  3.0000000e-03\n",
+      "  3.5100000e-01  6.0000000e-05  1.8179000e+01  1.6697000e+01\n",
+      "  3.9000000e-02  3.0000000e-02  6.9480000e+00  9.6800000e+00\n",
+      "  1.4820000e+00  2.5060000e+00  1.7676000e+01  1.3170000e+01\n",
+      "  1.5740000e+00  1.1960000e+00  7.9620000e+00  2.4380000e+00\n",
+      "  4.5050000e+00  1.5600000e-01]\n",
+      "--------------------------------------------------\n",
+      "Unique values in column 2010\n",
+      "[ 1.9700000e+02  4.8490000e+00  1.6000000e-02  2.8000000e-02\n",
+      "  3.1600000e-01  4.4890000e+00  0.0000000e+00            nan\n",
+      "  1.8761700e+02  1.8676900e+02  1.8045400e+02  6.3150000e+00\n",
+      "  8.4800000e-01  4.5340000e+00  3.3230000e+00  3.2000000e-01\n",
+      "  8.9200000e-01  3.0000000e-04  1.9571300e+02  1.7174700e+02\n",
+      "  8.2000000e-02  1.3300000e-01  7.7000000e+01  9.4532000e+01\n",
+      "  2.3966000e+01 -1.0691033e+04  2.3437780e+03  4.2247400e+02\n",
+      "  7.0899600e+02  1.2051170e+03  7.1920000e+00  1.9142000e+01\n",
+      " -1.3057077e+04 -1.3052876e+04 -1.3040518e+04  8.9270000e+00\n",
+      " -2.1284000e+01 -4.5210000e+00  3.1240000e+00  3.6900000e+01\n",
+      "  3.0400000e+00  7.5620000e+00  1.1900000e-01  3.0000000e-03\n",
+      "  6.0000000e-03  6.0000000e-02  5.1000000e-02  6.9670000e+00\n",
+      "  7.7000000e-02  6.8900000e+00  4.7500000e-01  5.0000000e-03\n",
+      "  4.7000000e-01  1.0000000e-03  2.7269600e+01  2.1977000e+01\n",
+      "  2.7300000e-02  4.6100000e-02  1.4539500e+01  7.3641000e+00\n",
+      "  5.2926000e+00  3.3038000e+00  1.9888000e+00  1.7748000e+01\n",
+      "  1.6310000e+01  1.0920000e+00  1.8330000e+00  1.1701000e+01\n",
+      "  1.6840000e+00  1.4380000e+00  1.0400000e-01]\n",
+      "--------------------------------------------------\n",
+      "Unique values in column 2019\n",
+      "[ 3.120340e+02  5.866000e+00  1.100000e-03  3.270000e-02  4.584000e-01\n",
+      "  5.374200e+00  0.000000e+00           nan  2.995030e+02  2.985330e+02\n",
+      "  2.882390e+02  1.029400e+01  9.700000e-01  6.665000e+00  5.170000e+00\n",
+      "  3.570000e-01  1.138000e+00  5.000000e-04  2.626700e+02  2.244300e+02\n",
+      "  5.400000e-03  1.391000e-01  1.121100e+02  1.121800e+02  3.823600e+01\n",
+      " -9.360370e+03  3.037736e+03  2.735500e+01  8.249530e+02  2.168661e+03\n",
+      "  1.676700e+01  1.114700e+02  6.984600e+01  2.097200e+01  2.065200e+01\n",
+      " -1.251307e+04 -1.251512e+04 -1.254581e+04  3.068900e+01  2.053000e+00\n",
+      "  3.491000e+00  6.619700e+01  6.141000e+00  1.211700e+01  1.770000e-01\n",
+      "  2.000000e-04  6.500000e-03  1.090000e-01  6.100000e-02  1.134400e+01\n",
+      "  8.900000e-02  1.125500e+01  5.970000e-01  6.000000e-03  5.910000e-01\n",
+      "  2.000000e-03  3.632300e+01  3.004400e+01  5.400000e-02  2.124900e+01\n",
+      "  8.739000e+00  6.279000e+00  3.773000e+00  2.506000e+00  2.822000e+01\n",
+      "  2.592000e+01  7.200000e-02  2.163000e+00  2.168000e+01  2.009000e+00\n",
+      "  2.294000e+00  3.280000e-01]\n"
+     ]
+    }
+   ],
+   "source": [
+    "### check data for errors ###\n",
+    "# list the distinct values of every column for a visual plausibility check\n",
+    "separator = '-'*50\n",
+    "for column_name in df_all_IF.columns:\n",
+    "    distinct_values = df_all_IF[column_name].unique()\n",
+    "    print(separator)\n",
+    "    print(f\"Unique values in column {column_name}\")\n",
+    "    print(distinct_values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "07812254-fb73-4cb5-ae45-a96a2f2273d4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-21 16:58:31.197\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78]], resulting in size 7,800.\u001b[0m\n",
+      "\u001b[32m2024-03-21 16:58:31.323\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "### convert to primap2 format ###\n",
+    "# Pass the interchange-format DataFrame df_all_IF to primap2's\n",
+    "# from_interchange_format and keep the result as data_pm2_main.\n",
+    "# NOTE(review): the captured log of this cell reports \"Reference information\n",
+    "# is not a DOI: 'placeholder'\" - presumably set via meta_data; confirm and\n",
+    "# replace with the real reference.\n",
+    "data_pm2_main = pm2.pm2io.from_interchange_format(df_all_IF)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88d4e68e-f1f4-4c7d-b710-c749296a16ca",
+   "metadata": {},
+   "source": [
+    "## 2. Read in sector tables for energy - pages 116, 117, 118, 119"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "251c3495-8506-4f43-9a97-094b5fb16947",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Reading table from page 116.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 117.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 118.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 119.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "Converting to interchange format.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>scenario (PRIMAP)</th>\n",
+       "      <th>provenance</th>\n",
+       "      <th>area (ISO3)</th>\n",
+       "      <th>entity</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>category (IPCC1996_2006_GIN_Inv)</th>\n",
+       "      <th>1990</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>2010</th>\n",
+       "      <th>2019</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6.465</td>\n",
+       "      <td>6.489</td>\n",
+       "      <td>4.849</td>\n",
+       "      <td>5.821</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A</td>\n",
+       "      <td>6.465</td>\n",
+       "      <td>6.489</td>\n",
+       "      <td>4.849</td>\n",
+       "      <td>5.821</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.1</td>\n",
+       "      <td>0.032</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.016</td>\n",
+       "      <td>0.001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.1.a</td>\n",
+       "      <td>0.032</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.016</td>\n",
+       "      <td>0.001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.1.a.i</td>\n",
+       "      <td>0.032</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.016</td>\n",
+       "      <td>0.001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>373</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>1.A.5.b.iii</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>374</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>1.A.5.c</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>375</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>1.B</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>376</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>M.BK.M</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>377</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>M.MULTIOP</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>378 rows × 11 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                source scenario (PRIMAP) provenance area (ISO3) entity  \\\n",
+       "0    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "1    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "2    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "3    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "4    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "..                 ...               ...        ...         ...    ...   \n",
+       "373  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "374  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "375  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "376  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "377  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "\n",
+       "            unit category (IPCC1996_2006_GIN_Inv)   1990   2000   2010   2019  \n",
+       "0    Gg CH4 / yr                                1  6.465  6.489  4.849  5.821  \n",
+       "1    Gg CH4 / yr                              1.A  6.465  6.489  4.849  5.821  \n",
+       "2    Gg CH4 / yr                            1.A.1  0.032  0.024  0.016  0.001  \n",
+       "3    Gg CH4 / yr                          1.A.1.a  0.032  0.024  0.016  0.001  \n",
+       "4    Gg CH4 / yr                        1.A.1.a.i  0.032  0.024  0.016  0.001  \n",
+       "..           ...                              ...    ...    ...    ...    ...  \n",
+       "373  Gg SO2 / yr                      1.A.5.b.iii    NaN    NaN    NaN    NaN  \n",
+       "374  Gg SO2 / yr                          1.A.5.c    NaN    NaN    NaN    NaN  \n",
+       "375  Gg SO2 / yr                              1.B    NaN    NaN    NaN    NaN  \n",
+       "376  Gg SO2 / yr                           M.BK.M    NaN    NaN    NaN    NaN  \n",
+       "377  Gg SO2 / yr                        M.MULTIOP    NaN    NaN    NaN    NaN  \n",
+       "\n",
+       "[378 rows x 11 columns]"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Read the energy tables page by page, bring each one into long format\n",
+    "# and convert the combined result to the primap2 interchange format.\n",
+    "# The year belonging to a page is looked up in inv_conf[\"year\"].\n",
+    "pages = ['116', '117', '118', '119']\n",
+    "\n",
+    "# replace individual categories\n",
+    "# (set once here, the values do not depend on the page)\n",
+    "# TODO: move to config section\n",
+    "inv_conf[\"cat_codes_manual\"]['energy'] = {\n",
+    "        'International Bunkers': 'MEMO',\n",
+    "        '1.A.3.a.i - Aviation internationale (soutes internationales)': 'M.BK.A',\n",
+    "        '1.A.3.d.i - Navigation internationale (soutes internationales)' : 'M.BK.M',\n",
+    "        '1.A.5.c - Opérations multilatérales' : 'M.MULTIOP',\n",
+    "    }\n",
+    "\n",
+    "# the code is the leading run of 1 to 11 letters, digits or dots that is\n",
+    "# followed by whitespace or a dot\n",
+    "inv_conf[\"cat_code_regexp\"] = r'^(?P<code>[a-zA-Z0-9\\.]{1,11})[\\s\\.].*'\n",
+    "repl = lambda m: m.group('code')\n",
+    "\n",
+    "df_energy_dict = {}\n",
+    "for page in pages:\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Reading table from page {page}.\")\n",
+    "\n",
+    "    tables_inventory_original = camelot.read_pdf(\n",
+    "        str(input_folder / pdf_file),\n",
+    "        pages=page,\n",
+    "        flavor=\"lattice\",\n",
+    "        split_text=True\n",
+    "        )\n",
+    "\n",
+    "    print(\"Reading complete.\")\n",
+    "\n",
+    "    # cut last two lines of second table to ignore additional information regarding biomass for energy production\n",
+    "    df_energy_year = pd.concat([tables_inventory_original[0].df[2:],\n",
+    "                                tables_inventory_original[1].df[3:-2]],\n",
+    "                                axis=0,\n",
+    "                                join='outer').reset_index(drop=True)\n",
+    "\n",
+    "    # drop duplicate lines - 1.A.3.d.i / 1.A.3.a.i / 1.A.5.c\n",
+    "    # TODO: better to find the index of the line and then drop it by the index\n",
+    "    df_energy_year = df_energy_year.drop(index=[27, 32, 50])\n",
+    "\n",
+    "    # add header and unit\n",
+    "    df_header = pd.DataFrame([inv_conf[\"header_energy\"], inv_conf[\"unit_energy\"]])\n",
+    "\n",
+    "    df_energy_year = pd.concat([df_header, df_energy_year], axis=0, join='outer').reset_index(drop=True)\n",
+    "\n",
+    "    df_energy_year = pm2.pm2io.nir_add_unit_information(df_energy_year,\n",
+    "                                                  unit_row=inv_conf[\"unit_row\"],\n",
+    "                                                  entity_row=inv_conf[\"entity_row\"],\n",
+    "                                                  regexp_entity=\".*\",\n",
+    "                                                  regexp_unit=\".*\",\n",
+    "                                                  default_unit=\"Gg\")\n",
+    "\n",
+    "    print(\"Added unit information.\")\n",
+    "    # set index\n",
+    "    df_energy_year = df_energy_year.set_index(inv_conf[\"index_cols\"])\n",
+    "\n",
+    "    # convert to long format\n",
+    "    df_energy_year_long = pm2.pm2io.nir_convert_df_to_long(df_energy_year, inv_conf[\"year\"][page],\n",
+    "                                                     inv_conf[\"header_long\"])\n",
+    "\n",
+    "    # extract from tuple\n",
+    "    df_energy_year_long[\"orig_cat_name\"] = df_energy_year_long[\"orig_cat_name\"].str[0]\n",
+    "\n",
+    "    # prep for conversion to PM2 IF and native format\n",
+    "    # make a copy of the categories row\n",
+    "    df_energy_year_long[\"category\"] = df_energy_year_long[\"orig_cat_name\"]\n",
+    "\n",
+    "    # replace cat names by codes in col \"category\"\n",
+    "    # first the manual replacements\n",
+    "    # (regex=False: all plain-text replacements are literal, independent\n",
+    "    # of the pandas default for `regex`)\n",
+    "    df_energy_year_long[\"category\"] = df_energy_year_long[\"category\"].str.replace('\\n' ,'', regex=False)\n",
+    "    df_energy_year_long[\"category\"] = \\\n",
+    "        df_energy_year_long[\"category\"].replace(inv_conf[\"cat_codes_manual\"]['energy'])\n",
+    "\n",
+    "    # remove every dot from the category strings\n",
+    "    # NOTE(review): the dotted codes in the result presumably come from\n",
+    "    # coords_value_mapping['energy'] - confirm\n",
+    "    df_energy_year_long[\"category\"] = df_energy_year_long[\"category\"].str.replace(\".\", \"\", regex=False)\n",
+    "\n",
+    "    # then the regex replacements\n",
+    "    df_energy_year_long[\"category\"] = \\\n",
+    "        df_energy_year_long[\"category\"].str.replace(inv_conf[\"cat_code_regexp\"], repl,\n",
+    "                                              regex=True)\n",
+    "\n",
+    "    df_energy_year_long = df_energy_year_long.reset_index(drop=True)\n",
+    "\n",
+    "    # decimal comma -> decimal point, \"NE1\" -> \"NE\"\n",
+    "    df_energy_year_long[\"data\"] = df_energy_year_long[\"data\"].str.replace(\",\", \".\", regex=False)\n",
+    "    df_energy_year_long[\"data\"] = df_energy_year_long[\"data\"].str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_energy_year_long.columns = df_energy_year_long.columns.map(str)\n",
+    "    df_energy_year_long = df_energy_year_long.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_energy_dict[page] = df_energy_year_long\n",
+    "\n",
+    "# combine the pages in the order of `pages`\n",
+    "df_energy = pd.concat([df_energy_dict[page] for page in pages],\n",
+    "                      axis=0,\n",
+    "                      join='outer').reset_index(drop=True)\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "df_energy_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_energy,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['energy'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )\n",
+    "\n",
+    "df_energy_IF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "64fa29dc-f62b-4010-bfed-8cd588675475",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.2.k, CH4, 1990.\n",
+      "[3.e-05]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.4.c.i, CO, 1990.\n",
+      "[0.0016]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.3.a.i, NMVOC, 2000.\n",
+      "[0.0002]\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1, SO2, 2010.\n",
+      "[0.]\n",
+      "No value found for 1, SO2, 2010!\n",
+      "Value matches expected value.\n",
+      "--------------------------------------------------\n",
+      "Testing combination 1.A.2.k, N2O, 2019.\n",
+      "[7.e-06]\n",
+      "Value matches expected value.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# spot checks of individual entries of df_energy_IF:\n",
+    "# category / entity / year (column label) and the value expected there\n",
+    "test_cases = {\n",
+    "    \"1\" : {\n",
+    "        \"category\" : \"1.A.2.k\",\n",
+    "        \"entity\" : \"CH4\",\n",
+    "        \"year\" : \"1990\",\n",
+    "        \"expected_value\" : 3e-05,\n",
+    "    },\n",
+    "    \"2\" : {\n",
+    "        \"category\" : \"1.A.4.c.i\",\n",
+    "        \"entity\" : \"CO\",\n",
+    "        \"year\" : \"1990\",\n",
+    "        \"expected_value\" : 0.0016,\n",
+    "    },\n",
+    "    \"3\" : {\n",
+    "        \"category\" : \"1.A.3.a.i\",\n",
+    "        \"entity\" : \"NMVOC\",\n",
+    "        \"year\" : \"2000\",\n",
+    "        \"expected_value\" : 0.0002,\n",
+    "    },\n",
+    "    \"4\" : {\n",
+    "        \"category\" : \"1\",\n",
+    "        \"entity\" : \"SO2\",\n",
+    "        \"year\" : \"2010\",\n",
+    "        \"expected_value\" : 0,\n",
+    "    },\n",
+    "    \"5\" : {\n",
+    "        \"category\" : \"1.A.2.k\",\n",
+    "        \"entity\" : \"N2O\",\n",
+    "        \"year\" : \"2019\",\n",
+    "        \"expected_value\" : 7e-06,\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "for key, case in test_cases.items():\n",
+    "    print(\"-\"*50)\n",
+    "    # single quotes inside the f-string: re-using the enclosing double\n",
+    "    # quotes is a SyntaxError before Python 3.12\n",
+    "    print(f\"Testing combination {case['category']}, {case['entity']}, {case['year']}.\")\n",
+    "    assert_individual_value(\n",
+    "        df = df_energy_IF,\n",
+    "        category_column = \"category (IPCC1996_2006_GIN_Inv)\",\n",
+    "        entity_column = \"entity\",\n",
+    "        category = case[\"category\"],\n",
+    "        entity = case[\"entity\"],\n",
+    "        year = case[\"year\"],\n",
+    "        expected_value = case[\"expected_value\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "bcf727f7-3474-4f2e-9bcb-ebdd140a14c1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-21 17:25:29.863\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54]], resulting in size 2,646.\u001b[0m\n",
+      "\u001b[32m2024-03-21 17:25:29.940\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "### convert to primap2 format ###\n",
+    "# Pass the interchange-format DataFrame df_energy_IF to primap2's\n",
+    "# from_interchange_format and keep the result as data_pm2_energy.\n",
+    "# NOTE(review): the captured log of this cell reports \"Reference information\n",
+    "# is not a DOI: 'placeholder'\" - presumably set via meta_data; confirm and\n",
+    "# replace with the real reference.\n",
+    "data_pm2_energy = pm2.pm2io.from_interchange_format(df_energy_IF)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d557a318-ea3f-44ec-9187-c05da423fbca",
+   "metadata": {},
+   "source": [
+    "## 3. Read in LULUCF tables - pages 124, 125, 126, 127"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "b4d117f0-6bfc-468f-b9f2-f66d5eaf8f1a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Reading table from page 124.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 125.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 126.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Reading table from page 127.\n",
+      "Reading complete.\n",
+      "Added unit information.\n",
+      "Converting to interchange format.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>scenario (PRIMAP)</th>\n",
+       "      <th>provenance</th>\n",
+       "      <th>area (ISO3)</th>\n",
+       "      <th>entity</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>category (IPCC1996_2006_GIN_Inv)</th>\n",
+       "      <th>1990</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>2010</th>\n",
+       "      <th>2019</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3</td>\n",
+       "      <td>56.987</td>\n",
+       "      <td>110.568</td>\n",
+       "      <td>187.617</td>\n",
+       "      <td>299.503</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3.A</td>\n",
+       "      <td>55.634</td>\n",
+       "      <td>107.911</td>\n",
+       "      <td>186.769</td>\n",
+       "      <td>298.533</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3.A.1</td>\n",
+       "      <td>53.796</td>\n",
+       "      <td>104.298</td>\n",
+       "      <td>180.454</td>\n",
+       "      <td>288.239</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3.A.1.a</td>\n",
+       "      <td>49.050</td>\n",
+       "      <td>94.967</td>\n",
+       "      <td>161.753</td>\n",
+       "      <td>256.319</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>3.A.1.a.i</td>\n",
+       "      <td>10.488</td>\n",
+       "      <td>17.802</td>\n",
+       "      <td>27.091</td>\n",
+       "      <td>31.905</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>469</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.C.7</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>470</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.C.8</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>471</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.D</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>472</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.D.1</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>473</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>NOx</td>\n",
+       "      <td>Gg NOx / yr</td>\n",
+       "      <td>3.D.2</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>474 rows × 11 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                source scenario (PRIMAP) provenance area (ISO3) entity  \\\n",
+       "0    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "1    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "2    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "3    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "4    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "..                 ...               ...        ...         ...    ...   \n",
+       "469  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "470  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "471  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "472  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "473  GIN-GHG-Inventory              BUR1   measured         GIN    NOx   \n",
+       "\n",
+       "            unit category (IPCC1996_2006_GIN_Inv)    1990     2000     2010  \\\n",
+       "0    Gg CH4 / yr                                3  56.987  110.568  187.617   \n",
+       "1    Gg CH4 / yr                              3.A  55.634  107.911  186.769   \n",
+       "2    Gg CH4 / yr                            3.A.1  53.796  104.298  180.454   \n",
+       "3    Gg CH4 / yr                          3.A.1.a  49.050   94.967  161.753   \n",
+       "4    Gg CH4 / yr                        3.A.1.a.i  10.488   17.802   27.091   \n",
+       "..           ...                              ...     ...      ...      ...   \n",
+       "469  Gg NOx / yr                            3.C.7   0.000    0.000    0.000   \n",
+       "470  Gg NOx / yr                            3.C.8   0.000    0.000    0.000   \n",
+       "471  Gg NOx / yr                              3.D   0.000    0.000    0.000   \n",
+       "472  Gg NOx / yr                            3.D.1   0.000    0.000    0.000   \n",
+       "473  Gg NOx / yr                            3.D.2   0.000    0.000    0.000   \n",
+       "\n",
+       "        2019  \n",
+       "0    299.503  \n",
+       "1    298.533  \n",
+       "2    288.239  \n",
+       "3    256.319  \n",
+       "4     31.905  \n",
+       "..       ...  \n",
+       "469    0.000  \n",
+       "470    0.000  \n",
+       "471    0.000  \n",
+       "472    0.000  \n",
+       "473    0.000  \n",
+       "\n",
+       "[474 rows x 11 columns]"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pages = ['124', '125', '126', '127']\n",
+    "\n",
+    "# callback for the category regex: keep only the named group 'code'\n",
+    "repl = lambda m: m.group('code')\n",
+    "\n",
+    "df_lulucf_dict = {}\n",
+    "for page in pages:\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Reading table from page {page}.\")\n",
+    "\n",
+    "    tables_inventory_original = camelot.read_pdf(\n",
+    "        str(input_folder / pdf_file),\n",
+    "        pages=page,\n",
+    "        flavor=\"lattice\",\n",
+    "        split_text=True,\n",
+    "    )\n",
+    "    print(\"Reading complete.\")\n",
+    "\n",
+    "    # In both branches the row slice is copied, so that the .at assignments\n",
+    "    # below modify an independent frame instead of a slice of the camelot table\n",
+    "    # (the resulting pandas warning would be hidden by warnings.filterwarnings).\n",
+    "    if page == '127':\n",
+    "        # table on page 127 has one extra row at the top\n",
+    "        # and one extra category 3.A.1.j\n",
+    "        df_lulucf_year = tables_inventory_original[0].df[3:].copy()\n",
+    "        # rename duplicate categories in tables\n",
+    "        # TODO move to config section\n",
+    "        replace_categories = [(19, \"3.A.2.a.i - Vaches laitières\"),\n",
+    "                              (20, \"3.A.2.a.ii - Autres bovins\"),\n",
+    "                              (21, \"3.A.2.b - Buffle\"),\n",
+    "                              (22, \"3.A.2.c - Ovins\"),\n",
+    "                              (23, \"3.A.2.d - Caprins\"),\n",
+    "                              (24, \"3.A.2.e - Chameaux\"),\n",
+    "                              (25, \"3.A.2.f - Chevaux\"),\n",
+    "                              (26, \"3.A.2.g - Mules et ânes\"),\n",
+    "                              (27, \"3.A.2.h - Porcins\"),\n",
+    "                              (28, \"3.A.2.i - Volailles\"),\n",
+    "                              (29, \"3.A.2.j - Autres (préciser)\"),]\n",
+    "    else:\n",
+    "        # cut first two lines\n",
+    "        df_lulucf_year = tables_inventory_original[0].df[2:].copy()\n",
+    "        # rename duplicate categories in tables\n",
+    "        # TODO move to config section\n",
+    "        replace_categories = [(17, \"3.A.2.a.i - Vaches laitières\"),\n",
+    "                              (18, \"3.A.2.a.ii - Autres bovins\"),\n",
+    "                              (19, \"3.A.2.b - Buffle\"),\n",
+    "                              (20, \"3.A.2.c - Ovins\"),\n",
+    "                              (21, \"3.A.2.d - Caprins\"),\n",
+    "                              (22, \"3.A.2.e - Chameaux\"),\n",
+    "                              (23, \"3.A.2.f - Chevaux\"),\n",
+    "                              (24, \"3.A.2.g - Mules et ânes\"),\n",
+    "                              (25, \"3.A.2.h - Porcins\"),\n",
+    "                              (26, \"3.A.2.i - Volailles\"),]\n",
+    "\n",
+    "    # the numbers are row labels of the extracted table (row slicing keeps them),\n",
+    "    # column 0 holds the category name\n",
+    "    for index, category_name in replace_categories:\n",
+    "        df_lulucf_year.at[index, 0] = category_name\n",
+    "\n",
+    "    # add header and unit\n",
+    "    df_header = pd.DataFrame([inv_conf[\"header_lulucf\"], inv_conf[\"unit_lulucf\"]])\n",
+    "\n",
+    "    df_lulucf_year = pd.concat([df_header, df_lulucf_year], axis=0, join='outer').reset_index(drop=True)\n",
+    "\n",
+    "    df_lulucf_year = pm2.pm2io.nir_add_unit_information(df_lulucf_year,\n",
+    "                                                  unit_row=inv_conf[\"unit_row\"],\n",
+    "                                                  entity_row=inv_conf[\"entity_row\"],\n",
+    "                                                  regexp_entity=\".*\",\n",
+    "                                                  regexp_unit=\".*\",\n",
+    "                                                  default_unit=\"Gg\")\n",
+    "\n",
+    "    print(\"Added unit information.\")\n",
+    "\n",
+    "    # set index\n",
+    "    df_lulucf_year = df_lulucf_year.set_index(inv_conf[\"index_cols\"])\n",
+    "\n",
+    "    # convert to long format\n",
+    "    df_lulucf_year_long = pm2.pm2io.nir_convert_df_to_long(df_lulucf_year, inv_conf[\"year\"][page],\n",
+    "                                                     inv_conf[\"header_long\"])\n",
+    "\n",
+    "    df_lulucf_year_long[\"orig_cat_name\"] = df_lulucf_year_long[\"orig_cat_name\"].str[0] # extract from tuple\n",
+    "\n",
+    "    # prep for conversion to PM2 IF and native format\n",
+    "    # make a copy of the categories row\n",
+    "    df_lulucf_year_long[\"category\"] = df_lulucf_year_long[\"orig_cat_name\"]\n",
+    "\n",
+    "    # regex replacements\n",
+    "    df_lulucf_year_long[\"category\"] = \\\n",
+    "        df_lulucf_year_long[\"category\"].str.replace(inv_conf[\"cat_code_regexp\"], repl,\n",
+    "                                              regex=True)\n",
+    "\n",
+    "    df_lulucf_year_long = df_lulucf_year_long.reset_index(drop=True)\n",
+    "\n",
+    "    # literal replacements (regex=False): decimal comma -> decimal point, \"NE1\" -> \"NE\"\n",
+    "    df_lulucf_year_long[\"data\"] = df_lulucf_year_long[\"data\"].str.replace(\",\", \".\", regex=False)\n",
+    "    df_lulucf_year_long[\"data\"] = df_lulucf_year_long[\"data\"].str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_lulucf_year_long.columns = df_lulucf_year_long.columns.map(str)\n",
+    "    df_lulucf_year_long = df_lulucf_year_long.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_lulucf_dict[page] = df_lulucf_year_long\n",
+    "\n",
+    "# stack the per-page tables in page order\n",
+    "df_lulucf = pd.concat([df_lulucf_dict[page_key] for page_key in pages],\n",
+    "                      axis=0,\n",
+    "                      join='outer').reset_index(drop=True)\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "df_lulucf_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_lulucf,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['lulucf'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )\n",
+    "\n",
+    "df_lulucf_IF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "8d132ea2-655a-4363-9171-b81904a7d6d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-22 09:22:15.333\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79]], resulting in size 2,844.\u001b[0m\n",
+      "\u001b[32m2024-03-22 09:22:15.408\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "# build the native primap2 dataset from the LULUCF interchange-format data\n",
+    "data_pm2_lulucf = pm2.pm2io.from_interchange_format(\n",
+    "    df_lulucf_IF,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a99c689e-1f26-42d5-8974-194373ce26f6",
+   "metadata": {},
+   "source": [
+    "# 3. Read in Waste tables - pages 128, 130"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "fcf17dba-6af4-400f-9ec3-b5dd5b1b0a82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Page 128 holds three tables, used below for 1990, 2000 and 2010;\n",
+    "# the table used for 2019 is read from page 130.\n",
+    "\n",
+    "# tables on page 128\n",
+    "page = '128'\n",
+    "tables_inventory_original_128 = camelot.read_pdf(\n",
+    "    str(input_folder / pdf_file), pages=page, flavor=\"lattice\", split_text=True\n",
+    ")\n",
+    "\n",
+    "# table on page 130\n",
+    "page = '130'\n",
+    "tables_inventory_original_130 = camelot.read_pdf(\n",
+    "    str(input_folder / pdf_file), pages=page, flavor=\"lattice\", split_text=True\n",
+    ")\n",
+    "\n",
+    "# raw table per inventory year, in chronological order\n",
+    "df_waste_years = {\n",
+    "    year_128: tables_inventory_original_128[position].df\n",
+    "    for position, year_128 in enumerate(['1990', '2000', '2010'])\n",
+    "}\n",
+    "df_waste_years['2019'] = tables_inventory_original_130[0].df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "4e0afb6e-db8b-41ae-b02d-e4a5d54ea5ed",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Processing table for 1990.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Processing table for 2000.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Processing table for 2010.\n",
+      "Added unit information.\n",
+      "---------------------------------------------\n",
+      "Processing table for 2019.\n",
+      "Added unit information.\n",
+      "Converting to interchange format.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>scenario (PRIMAP)</th>\n",
+       "      <th>provenance</th>\n",
+       "      <th>area (ISO3)</th>\n",
+       "      <th>entity</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>category (IPCC1996_2006_GIN_Inv)</th>\n",
+       "      <th>1990</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>2010</th>\n",
+       "      <th>2019</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4</td>\n",
+       "      <td>1.750</td>\n",
+       "      <td>2.925</td>\n",
+       "      <td>4.534</td>\n",
+       "      <td>6.665</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4.A</td>\n",
+       "      <td>1.029</td>\n",
+       "      <td>2.054</td>\n",
+       "      <td>3.323</td>\n",
+       "      <td>5.170</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4.A.1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4.A.2</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>4.A.3</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>86</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.C.2</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>87</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.D</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>88</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.D.1</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>89</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.D.2</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>90</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>SO2</td>\n",
+       "      <td>Gg SO2 / yr</td>\n",
+       "      <td>4.E</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>91 rows × 11 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               source scenario (PRIMAP) provenance area (ISO3) entity  \\\n",
+       "0   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "1   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "2   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "3   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "4   GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "..                ...               ...        ...         ...    ...   \n",
+       "86  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "87  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "88  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "89  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "90  GIN-GHG-Inventory              BUR1   measured         GIN    SO2   \n",
+       "\n",
+       "           unit category (IPCC1996_2006_GIN_Inv)   1990   2000   2010   2019  \n",
+       "0   Gg CH4 / yr                                4  1.750  2.925  4.534  6.665  \n",
+       "1   Gg CH4 / yr                              4.A  1.029  2.054  3.323  5.170  \n",
+       "2   Gg CH4 / yr                            4.A.1    NaN    NaN    NaN    NaN  \n",
+       "3   Gg CH4 / yr                            4.A.2    NaN    NaN    NaN    NaN  \n",
+       "4   Gg CH4 / yr                            4.A.3    NaN    NaN    NaN    NaN  \n",
+       "..          ...                              ...    ...    ...    ...    ...  \n",
+       "86  Gg SO2 / yr                            4.C.2  0.000  0.000  0.000  0.000  \n",
+       "87  Gg SO2 / yr                              4.D  0.000  0.000  0.000  0.000  \n",
+       "88  Gg SO2 / yr                            4.D.1  0.000  0.000  0.000  0.000  \n",
+       "89  Gg SO2 / yr                            4.D.2  0.000  0.000  0.000  0.000  \n",
+       "90  Gg SO2 / yr                              4.E  0.000  0.000  0.000  0.000  \n",
+       "\n",
+       "[91 rows x 11 columns]"
+      ]
+     },
+     "execution_count": 57,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# callback for the category regex: keep only the named group 'code'\n",
+    "repl = lambda m: m.group('code')\n",
+    "\n",
+    "df_waste_dict = {}\n",
+    "for year in df_waste_years.keys():\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Processing table for {year}.\")\n",
+    "\n",
+    "    # cut first two lines, they are replaced by the header and unit rows below\n",
+    "    df_waste_year = df_waste_years[year][2:]\n",
+    "\n",
+    "    # add header and unit\n",
+    "    df_header = pd.DataFrame([inv_conf[\"header_waste\"], inv_conf[\"unit_waste\"]])\n",
+    "\n",
+    "    df_waste_year = pd.concat([df_header, df_waste_year], axis=0, join='outer').reset_index(drop=True)\n",
+    "\n",
+    "    df_waste_year = pm2.pm2io.nir_add_unit_information(df_waste_year,\n",
+    "                                                  unit_row=inv_conf[\"unit_row\"],\n",
+    "                                                  entity_row=inv_conf[\"entity_row\"],\n",
+    "                                                  regexp_entity=\".*\",\n",
+    "                                                  regexp_unit=\".*\",\n",
+    "                                                  default_unit=\"Gg\")\n",
+    "\n",
+    "    print(\"Added unit information.\")\n",
+    "\n",
+    "    # set index\n",
+    "    df_waste_year = df_waste_year.set_index(inv_conf[\"index_cols\"])\n",
+    "\n",
+    "    # convert to long format\n",
+    "    df_waste_year_long = pm2.pm2io.nir_convert_df_to_long(df_waste_year, year,\n",
+    "                                                     inv_conf[\"header_long\"])\n",
+    "\n",
+    "    df_waste_year_long[\"orig_cat_name\"] = df_waste_year_long[\"orig_cat_name\"].str[0] # extract from tuple\n",
+    "\n",
+    "    # prep for conversion to PM2 IF and native format\n",
+    "    # make a copy of the categories row\n",
+    "    df_waste_year_long[\"category\"] = df_waste_year_long[\"orig_cat_name\"]\n",
+    "\n",
+    "    # regex replacements\n",
+    "    df_waste_year_long[\"category\"] = \\\n",
+    "        df_waste_year_long[\"category\"].str.replace(inv_conf[\"cat_code_regexp\"], repl,\n",
+    "                                              regex=True)\n",
+    "\n",
+    "    df_waste_year_long = df_waste_year_long.reset_index(drop=True)\n",
+    "\n",
+    "    # literal replacements: regex=False is required for \".\", which would otherwise\n",
+    "    # depend on the pandas-version-specific default of the regex argument\n",
+    "    df_waste_year_long[\"category\"] = df_waste_year_long[\"category\"].str.replace(\".\", \"\", regex=False)\n",
+    "    df_waste_year_long[\"data\"] = df_waste_year_long[\"data\"].str.replace(\",\", \".\", regex=False)\n",
+    "    df_waste_year_long[\"data\"] = df_waste_year_long[\"data\"].str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_waste_year_long.columns = df_waste_year_long.columns.map(str)\n",
+    "    df_waste_year_long = df_waste_year_long.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_waste_dict[year] = df_waste_year_long\n",
+    "\n",
+    "# stack the per-year tables in the order of df_waste_years\n",
+    "df_waste = pd.concat([df_waste_dict[year_key] for year_key in df_waste_years],\n",
+    "                      axis=0,\n",
+    "                      join='outer').reset_index(drop=True)\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "df_waste_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_waste,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['waste'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )\n",
+    "\n",
+    "df_waste_IF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "6628eacb-8a24-415b-a42e-04e929976f83",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-22 09:27:11.859\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13]], resulting in size 637.\u001b[0m\n",
+      "\u001b[32m2024-03-22 09:27:11.898\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "# build the native primap2 dataset from the waste interchange-format data\n",
+    "data_pm2_waste = pm2.pm2io.from_interchange_format(\n",
+    "    df_waste_IF,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba512153-1c65-4568-9bae-817fbf9cc9b3",
+   "metadata": {},
+   "source": [
+    "# 4. Read in trend tables - pages 131 - 137"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "0e71c7b2-c301-4048-8b92-c9fc58a2501f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------\n",
+      "Reading table for page 131 and entity CO2.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 132 and entity CH4.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 133 and entity N2O.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 134 and entity NOx.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 135 and entity CO.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 136 and entity NMVOCs.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "---------------------------------------------\n",
+      "Reading table for page 137 and entity SO2.\n",
+      "Reading complete.\n",
+      "Created category codes.\n",
+      "Converted to long format.\n",
+      "Converting to interchange format.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>scenario (PRIMAP)</th>\n",
+       "      <th>provenance</th>\n",
+       "      <th>area (ISO3)</th>\n",
+       "      <th>entity</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>category (IPCC1996_2006_GIN_Inv)</th>\n",
+       "      <th>1990</th>\n",
+       "      <th>1995</th>\n",
+       "      <th>2000</th>\n",
+       "      <th>2005</th>\n",
+       "      <th>2010</th>\n",
+       "      <th>2015</th>\n",
+       "      <th>2018</th>\n",
+       "      <th>2019</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>0</td>\n",
+       "      <td>65.202</td>\n",
+       "      <td>93.368</td>\n",
+       "      <td>119.981</td>\n",
+       "      <td>152.272</td>\n",
+       "      <td>196.057</td>\n",
+       "      <td>253.025</td>\n",
+       "      <td>296.416</td>\n",
+       "      <td>312.034</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6.465</td>\n",
+       "      <td>7.066</td>\n",
+       "      <td>6.489</td>\n",
+       "      <td>5.984</td>\n",
+       "      <td>4.849</td>\n",
+       "      <td>5.360</td>\n",
+       "      <td>5.931</td>\n",
+       "      <td>5.866</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A</td>\n",
+       "      <td>6.465</td>\n",
+       "      <td>7.066</td>\n",
+       "      <td>6.489</td>\n",
+       "      <td>5.984</td>\n",
+       "      <td>4.849</td>\n",
+       "      <td>5.360</td>\n",
+       "      <td>5.931</td>\n",
+       "      <td>5.866</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.1</td>\n",
+       "      <td>0.032</td>\n",
+       "      <td>0.027</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.020</td>\n",
+       "      <td>0.016</td>\n",
+       "      <td>0.002</td>\n",
+       "      <td>0.005</td>\n",
+       "      <td>0.001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CH4</td>\n",
+       "      <td>Gg CH4 / yr</td>\n",
+       "      <td>1.A.2</td>\n",
+       "      <td>0.006</td>\n",
+       "      <td>0.012</td>\n",
+       "      <td>0.018</td>\n",
+       "      <td>0.023</td>\n",
+       "      <td>0.028</td>\n",
+       "      <td>0.024</td>\n",
+       "      <td>0.026</td>\n",
+       "      <td>0.033</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>151</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>5</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>152</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>M.BK</td>\n",
+       "      <td>0.719</td>\n",
+       "      <td>1.438</td>\n",
+       "      <td>2.158</td>\n",
+       "      <td>19.529</td>\n",
+       "      <td>36.900</td>\n",
+       "      <td>21.840</td>\n",
+       "      <td>51.718</td>\n",
+       "      <td>66.197</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>153</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>M.BK.A</td>\n",
+       "      <td>0.719</td>\n",
+       "      <td>1.438</td>\n",
+       "      <td>2.158</td>\n",
+       "      <td>19.529</td>\n",
+       "      <td>36.900</td>\n",
+       "      <td>21.840</td>\n",
+       "      <td>51.718</td>\n",
+       "      <td>66.197</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>154</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>M.BK.M</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>155</th>\n",
+       "      <td>GIN-GHG-Inventory</td>\n",
+       "      <td>BUR1</td>\n",
+       "      <td>measured</td>\n",
+       "      <td>GIN</td>\n",
+       "      <td>CO2</td>\n",
+       "      <td>Gg CO2 / yr</td>\n",
+       "      <td>M.MULTIOP</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "      <td>0.000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>156 rows × 15 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                source scenario (PRIMAP) provenance area (ISO3) entity  \\\n",
+       "0    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "1    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "2    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "3    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "4    GIN-GHG-Inventory              BUR1   measured         GIN    CH4   \n",
+       "..                 ...               ...        ...         ...    ...   \n",
+       "151  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "152  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "153  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "154  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "155  GIN-GHG-Inventory              BUR1   measured         GIN    CO2   \n",
+       "\n",
+       "            unit category (IPCC1996_2006_GIN_Inv)    1990    1995     2000  \\\n",
+       "0    Gg CH4 / yr                                0  65.202  93.368  119.981   \n",
+       "1    Gg CH4 / yr                                1   6.465   7.066    6.489   \n",
+       "2    Gg CH4 / yr                              1.A   6.465   7.066    6.489   \n",
+       "3    Gg CH4 / yr                            1.A.1   0.032   0.027    0.024   \n",
+       "4    Gg CH4 / yr                            1.A.2   0.006   0.012    0.018   \n",
+       "..           ...                              ...     ...     ...      ...   \n",
+       "151  Gg CO2 / yr                                5     NaN     NaN      NaN   \n",
+       "152  Gg CO2 / yr                             M.BK   0.719   1.438    2.158   \n",
+       "153  Gg CO2 / yr                           M.BK.A   0.719   1.438    2.158   \n",
+       "154  Gg CO2 / yr                           M.BK.M     NaN     NaN      NaN   \n",
+       "155  Gg CO2 / yr                        M.MULTIOP   0.000   0.000    0.000   \n",
+       "\n",
+       "        2005     2010     2015     2018     2019  \n",
+       "0    152.272  196.057  253.025  296.416  312.034  \n",
+       "1      5.984    4.849    5.360    5.931    5.866  \n",
+       "2      5.984    4.849    5.360    5.931    5.866  \n",
+       "3      0.020    0.016    0.002    0.005    0.001  \n",
+       "4      0.023    0.028    0.024    0.026    0.033  \n",
+       "..       ...      ...      ...      ...      ...  \n",
+       "151      NaN      NaN      NaN      NaN      NaN  \n",
+       "152   19.529   36.900   21.840   51.718   66.197  \n",
+       "153   19.529   36.900   21.840   51.718   66.197  \n",
+       "154      NaN      NaN      NaN      NaN      NaN  \n",
+       "155    0.000    0.000    0.000    0.000    0.000  \n",
+       "\n",
+       "[156 rows x 15 columns]"
+      ]
+     },
+     "execution_count": 67,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Read the trend tables (one table per page, one entity per table), bring\n",
+    "# each of them into long format and convert the combined data to the\n",
+    "# primap2 interchange format.\n",
+    "\n",
+    "# pages and entities are paired by position\n",
+    "pages = ['131', '132', '133', '134', '135', '136', '137']\n",
+    "entities = ['CO2', 'CH4', 'N2O', 'NOx', 'CO', 'NMVOCs', 'SO2']\n",
+    "\n",
+    "# 'data' prefix is needed for pd.wide_to_long() later\n",
+    "columns_years = ['data1990', 'data1995', 'data2000', 'data2005',\n",
+    "                 'data2010', 'data2015', 'data2018', 'data2019']\n",
+    "\n",
+    "# regular expression that cuts the leading category code out of a label;\n",
+    "# it does not depend on the page, so it is set once before the loop\n",
+    "inv_conf[\"cat_code_regexp\"] = r'^(?P<code>[a-zA-Z0-9\\.]{1,11})[\\s\\.].*'\n",
+    "\n",
+    "# labels that are translated to a category code explicitly\n",
+    "cat_name_to_code = {\n",
+    "    'Total des émissions et absorptions nationales': '0',\n",
+    "    '2A5: Autre': '2A5',\n",
+    "    'Éléments pour mémoire': 'MEMO',\n",
+    "    'Soutes internationales': 'M.BK',\n",
+    "    '1.A.3.a.i - Aviation internationale (soutes internationales)': 'M.BK.A',\n",
+    "    '1.A.3.d.i - Navigation internationale (soutes internationales)': 'M.BK.M',\n",
+    "    '1.A.5.c - Opérations multilatérales': 'M.MULTIOP',\n",
+    "}\n",
+    "\n",
+    "# long-format dataframe per page\n",
+    "df_main_dict = {}\n",
+    "\n",
+    "# for this set of tables every page is a different entity\n",
+    "for page, entity in zip(pages, entities):\n",
+    "\n",
+    "    print(\"-\"*45)\n",
+    "    print(f\"Reading table for page {page} and entity {entity}.\")\n",
+    "\n",
+    "    # first table needs to be read in with flavor=\"stream\"\n",
+    "    # flavor=\"lattice\" raises an error, maybe camelot issue\n",
+    "    # see https://github.com/atlanhq/camelot/issues/306\n",
+    "    # or because characters in first row almost reach\n",
+    "    # the table grid\n",
+    "    if page == '131':\n",
+    "        tables_inventory_original = camelot.read_pdf(\n",
+    "            str(input_folder / pdf_file),\n",
+    "            pages=page,\n",
+    "            table_areas=page_def_templates[page][\"area\"],\n",
+    "            columns=page_def_templates[page][\"cols\"],\n",
+    "            flavor=\"stream\",\n",
+    "            split_text=True\n",
+    "        )\n",
+    "        # skip the first row (presumably the table header - TODO confirm);\n",
+    "        # .copy() gives an independent frame, so the column assignments\n",
+    "        # below do not write into a slice of camelot's dataframe\n",
+    "        df_trend_entity = tables_inventory_original[0].df[1:].copy()\n",
+    "    else:\n",
+    "        tables_inventory_original = camelot.read_pdf(\n",
+    "            str(input_folder / pdf_file),\n",
+    "            pages=page,\n",
+    "            flavor=\"lattice\",\n",
+    "            split_text=True)\n",
+    "        # skip the first three rows (presumably header rows - TODO confirm)\n",
+    "        df_trend_entity = tables_inventory_original[0].df[3:].copy()\n",
+    "\n",
+    "    print(\"Reading complete.\")\n",
+    "\n",
+    "    # add columns\n",
+    "    df_trend_entity.columns = ['orig_cat_name'] + columns_years\n",
+    "\n",
+    "    # unit is always Gg\n",
+    "    df_trend_entity['unit'] = 'Gg'\n",
+    "\n",
+    "    # only one entity per table\n",
+    "    df_trend_entity['entity'] = entity\n",
+    "\n",
+    "    df_trend_entity[\"category\"] = df_trend_entity[\"orig_cat_name\"]\n",
+    "\n",
+    "    # delete the first row with an empty label;\n",
+    "    # in the first table there is no empty line\n",
+    "    if page != '131':\n",
+    "        row_to_delete = df_trend_entity.index[df_trend_entity['category'] == ''][0]\n",
+    "        df_trend_entity = df_trend_entity.drop(index=row_to_delete)\n",
+    "\n",
+    "    df_trend_entity[\"category\"] = df_trend_entity[\"category\"].replace(cat_name_to_code)\n",
+    "\n",
+    "    # remove dots and line breaks from the labels; regex=False makes sure\n",
+    "    # that \".\" is a literal dot in every pandas version\n",
+    "    df_trend_entity[\"category\"] = (\n",
+    "        df_trend_entity[\"category\"]\n",
+    "        .str.replace(\".\", \"\", regex=False)\n",
+    "        .str.replace(\"\\n\", \"\", regex=False)\n",
+    "    )\n",
+    "\n",
+    "    # keep only the code where a label starts with a category code;\n",
+    "    # labels that do not match the pattern stay unchanged\n",
+    "    df_trend_entity[\"category\"] = df_trend_entity[\"category\"].str.replace(\n",
+    "        inv_conf[\"cat_code_regexp\"], lambda m: m.group('code'), regex=True)\n",
+    "\n",
+    "    df_trend_entity = df_trend_entity.reset_index(drop=True)\n",
+    "\n",
+    "    print(\"Created category codes.\")\n",
+    "\n",
+    "    # decimal comma -> decimal point, \"NE1\" -> \"NE\"\n",
+    "    # (the trailing 1 is presumably a footnote marker - TODO confirm)\n",
+    "    for year in columns_years:\n",
+    "        df_trend_entity[year] = (\n",
+    "            df_trend_entity[year]\n",
+    "            .str.replace(\",\", \".\", regex=False)\n",
+    "            .str.replace(\"NE1\", \"NE\", regex=False)\n",
+    "        )\n",
+    "\n",
+    "    # make sure all col headers are str\n",
+    "    df_trend_entity.columns = df_trend_entity.columns.map(str)\n",
+    "\n",
+    "    df_trend_entity = df_trend_entity.drop(columns=[\"orig_cat_name\"])\n",
+    "\n",
+    "    df_trend_entity_long = pd.wide_to_long(df_trend_entity, stubnames='data', i='category', j='time')\n",
+    "\n",
+    "    print(\"Converted to long format.\")\n",
+    "\n",
+    "    df_trend_entity_long = df_trend_entity_long.reset_index()\n",
+    "\n",
+    "    df_main_dict[page] = df_trend_entity_long\n",
+    "\n",
+    "print(\"Converting to interchange format.\")\n",
+    "\n",
+    "# combine the tables of all pages; previously only pages 131 and 132 were\n",
+    "# concatenated, which silently dropped every entity except CO2 and CH4\n",
+    "df_trend_all = pd.concat(\n",
+    "    [df_main_dict[page] for page in pages], axis=0, join='outer'\n",
+    ").reset_index(drop=True)\n",
+    "\n",
+    "df_trend_IF = pm2.pm2io.convert_long_dataframe_if(\n",
+    "    df_trend_all,\n",
+    "    coords_cols=coords_cols,\n",
+    "    #add_coords_cols=add_coords_cols,\n",
+    "    coords_defaults=coords_defaults,\n",
+    "    coords_terminologies=coords_terminologies,\n",
+    "    coords_value_mapping=coords_value_mapping['trend'],\n",
+    "    #coords_value_filling=coords_value_filling,\n",
+    "    filter_remove=filter_remove,\n",
+    "    #filter_keep=filter_keep,\n",
+    "    meta_data=meta_data,\n",
+    "    convert_str=True,\n",
+    "    time_format=\"%Y\",\n",
+    "    )\n",
+    "\n",
+    "df_trend_IF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "05e1ad4f-c35c-460c-8546-5e493f363739",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-22 09:52:43.765\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2.pm2io._interchange_format\u001b[0m:\u001b[36mfrom_interchange_format\u001b[0m:\u001b[36m320\u001b[0m - \u001b[34m\u001b[1mExpected array shapes: [[1, 1, 1, 1, 2, 78], [1, 1, 1, 1, 2, 78]], resulting in size 312.\u001b[0m\n",
+      "\u001b[32m2024-03-22 09:52:43.826\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mprimap2._data_format\u001b[0m:\u001b[36mensure_valid_attributes\u001b[0m:\u001b[36m292\u001b[0m - \u001b[1mReference information is not a DOI: 'placeholder'\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "### convert to primap2 format ###\n",
+    "data_pm2_trend = pm2.pm2io.from_interchange_format(df_trend_IF)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3b65227-b7c4-4d18-89ef-af927c9a81b5",
+   "metadata": {},
+   "source": [
+    "# Combine tables and save to IF and native format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "960117b6-28fc-45ba-a768-16f63e428875",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-03-22 10:09:36.801\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for CH4\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.026\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for CO2\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.187\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for N2O\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.351\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for SO2\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.448\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge\u001b[0m:\u001b[36m230\u001b[0m - \u001b[34m\u001b[1mmerging for NMVOC\u001b[0m\n",
+      "\u001b[32m2024-03-22 10:09:37.533\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[36mprimap2._merge\u001b[0m:\u001b[36mmerge_with_tolerance_core\u001b[0m:\u001b[36m74\u001b[0m - \u001b[31m\u001b[1mpr.merge error: found discrepancies larger than tolerance (11.00%) for source=GIN-GHG-Inventory, scenario (PRIMAP)=BUR1, provenance=measured, area (ISO3)=GIN, category (IPCC1996_2006_GIN_Inv)=1.A.2:\n",
+      "shown are relative discrepancies.\n",
+      "               NMVOC\n",
+      "time                \n",
+      "1990-01-01  0.800000\n",
+      "2000-01-01  0.800000\n",
+      "2010-01-01  0.869848\u001b[0m\n"
+     ]
+    },
+    {
+     "ename": "MergeError",
+     "evalue": "pr.merge error: found discrepancies larger than tolerance (11.00%) for source=GIN-GHG-Inventory, scenario (PRIMAP)=BUR1, provenance=measured, area (ISO3)=GIN, category (IPCC1996_2006_GIN_Inv)=1.A.2:\nshown are relative discrepancies.\n               NMVOC\ntime                \n1990-01-01  0.800000\n2000-01-01  0.800000\n2010-01-01  0.869848",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mMergeError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[72], line 10\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m#### combine\u001b[39;00m\n\u001b[1;32m      2\u001b[0m \n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m#data_pm2_main\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m      8\u001b[0m \n\u001b[1;32m      9\u001b[0m \u001b[38;5;66;03m# tolerance needs to be high as rounding in trend tables leads to inconsistent data\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m data_pm2 \u001b[38;5;241m=\u001b[39m \u001b[43mdata_pm2_main\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmerge\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_pm2_energy\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.11\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/Documents/UNFCCC_non-AnnexI_data/venv/lib/python3.12/site-packages/primap2/_merge.py:231\u001b[0m, in \u001b[0;36mDatasetMergeAccessor.merge\u001b[0;34m(self, ds_merge, tolerance, error_on_discrepancy, combine_attrs)\u001b[0m\n\u001b[1;32m    229\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m var \u001b[38;5;129;01min\u001b[39;00m vars_common:\n\u001b[1;32m    230\u001b[0m     logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmerging for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvar\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 231\u001b[0m     ds_result_new \u001b[38;5;241m=\u001b[39m \u001b[43mmerge_with_tolerance_core\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    232\u001b[0m \u001b[43m        \u001b[49m\u001b[43mda_start\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mds_start\u001b[49m\u001b[43m[\u001b[49m\u001b[43mvar\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    233\u001b[0m \u001b[43m        \u001b[49m\u001b[43mda_merge\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mds_merge\u001b[49m\u001b[43m[\u001b[49m\u001b[43mvar\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    234\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtolerance\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    235\u001b[0m \u001b[43m        \u001b[49m\u001b[43merror_on_discrepancy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merror_on_discrepancy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    237\u001b[0m     ds_result \u001b[38;5;241m=\u001b[39m xr\u001b[38;5;241m.\u001b[39mmerge([ds_result, ds_result_new], combine_attrs\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverride\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    238\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m ds_result\n",
+      "File \u001b[0;32m~/Documents/UNFCCC_non-AnnexI_data/venv/lib/python3.12/site-packages/primap2/_merge.py:75\u001b[0m, in \u001b[0;36mmerge_with_tolerance_core\u001b[0;34m(da_start, da_merge, tolerance, error_on_discrepancy)\u001b[0m\n\u001b[1;32m     73\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_on_discrepancy:\n\u001b[1;32m     74\u001b[0m     logger\u001b[38;5;241m.\u001b[39merror(log_message)\n\u001b[0;32m---> 75\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m xr\u001b[38;5;241m.\u001b[39mMergeError(log_message)\n\u001b[1;32m     76\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m     77\u001b[0m     \u001b[38;5;66;03m# log warning, continue with merging\u001b[39;00m\n\u001b[1;32m     78\u001b[0m     logger\u001b[38;5;241m.\u001b[39mwarning(log_message)\n",
+      "\u001b[0;31mMergeError\u001b[0m: pr.merge error: found discrepancies larger than tolerance (11.00%) for source=GIN-GHG-Inventory, scenario (PRIMAP)=BUR1, provenance=measured, area (ISO3)=GIN, category (IPCC1996_2006_GIN_Inv)=1.A.2:\nshown are relative discrepancies.\n               NMVOC\ntime                \n1990-01-01  0.800000\n2000-01-01  0.800000\n2010-01-01  0.869848"
+     ]
+    }
+   ],
+   "source": [
+    "#### combine\n",
+    "\n",
+    "#data_pm2_main\n",
+    "#data_pm2_trend\n",
+    "#data_pm2_energy\n",
+    "#data_pm2_lulucf\n",
+    "#data_pm2_waste\n",
+    "\n",
+    "# tolerance needs to be high as rounding in trend tables leads to inconsistent data\n",
+    "data_pm2 = data_pm2_main.pr.merge(data_pm2_energy,tolerance=0.11)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3cb74c9e-b400-454b-848a-28091b832016",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Go back to the interchange format so that the units are in the fixed format\n",
+    "data_if = data_pm2.pr.to_interchange_format()\n",
+    "\n",
+    "# ###\n",
+    "# save data to IF and native format\n",
+    "# ###\n",
+    "\n",
+    "# both output files share the same stem\n",
+    "raw_file_stem = output_filename + coords_terminologies[\"category\"] + \"_raw\"\n",
+    "\n",
+    "# interchange format\n",
+    "pm2.pm2io.write_interchange_format(output_folder / raw_file_stem, data_if)\n",
+    "\n",
+    "# native format (netCDF); every data variable uses the same encoding settings\n",
+    "encoding = dict.fromkeys(data_pm2.data_vars, compression)\n",
+    "data_pm2.pr.to_netcdf(output_folder / (raw_file_stem + \".nc\"), encoding=encoding)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}