
Docstrings and dealing with ruff messages for unfccc_reader (not final as black is fighting ruff)

Johannes Gütschow, 1 year ago
commit 06a9aceb8e
65 changed files with 7831 additions and 4903 deletions
1. Makefile (+4 -0)
2. docs/source/conf.py (+7 -8)
3. poetry.lock (+15 -1)
4. pyproject.toml (+1 -0)
5. src/unfccc_ghg_data/__init__.py (+8 -3)
6. src/unfccc_ghg_data/helper/__init__.py (+1 -1)
7. src/unfccc_ghg_data/helper/definitions.py (+234 -109)
8. src/unfccc_ghg_data/helper/folder_mapping.py (+6 -5)
9. src/unfccc_ghg_data/helper/functions.py (+4 -6)
10. src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_core.py (+286 -176)
11. src/unfccc_ghg_data/unfccc_di_reader/read_unfccc_di_for_country.py (+8 -7)
12. src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_bur.py (+5 -8)
13. src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_nc.py (+3 -7)
14. src/unfccc_ghg_data/unfccc_reader/Argentina/__init__.py (+27 -4)
15. src/unfccc_ghg_data/unfccc_reader/Argentina/read_ARG_BUR4_from_pdf.py (+115 -96)
16. src/unfccc_ghg_data/unfccc_reader/Chile/__init__.py (+28 -5)
17. src/unfccc_ghg_data/unfccc_reader/Chile/config_chl_bur4.py (+288 -141)
18. src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR4_from_xlsx.py (+90 -52)
19. src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR5_from_xlsx.py (+100 -55)
20. src/unfccc_ghg_data/unfccc_reader/Colombia/__init__.py (+30 -1)
21. src/unfccc_ghg_data/unfccc_reader/Colombia/read_COL_BUR3_from_xlsx.py (+104 -84)
22. src/unfccc_ghg_data/unfccc_reader/Indonesia/__init__.py (+30 -0)
23. src/unfccc_ghg_data/unfccc_reader/Indonesia/read_IDN_BUR3_from_pdf.py (+167 -100)
24. src/unfccc_ghg_data/unfccc_reader/Israel/__init__.py (+30 -0)
25. src/unfccc_ghg_data/unfccc_reader/Israel/config_isr_bur2.py (+409 -314)
26. src/unfccc_ghg_data/unfccc_reader/Israel/read_ISR_BUR2_from_pdf.py (+121 -77)
27. src/unfccc_ghg_data/unfccc_reader/Malaysia/__init__.py (+30 -0)
28. src/unfccc_ghg_data/unfccc_reader/Malaysia/config_mys_bur3.py (+922 -602)
29. src/unfccc_ghg_data/unfccc_reader/Malaysia/config_mys_bur4.py (+258 -253)
30. src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR3_from_pdf.py (+82 -52)
31. src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR4_from_pdf.py (+84 -55)
32. src/unfccc_ghg_data/unfccc_reader/Mexico/__init__.py (+30 -0)
33. src/unfccc_ghg_data/unfccc_reader/Mexico/config_mex_bur3.py (+81 -35)
34. src/unfccc_ghg_data/unfccc_reader/Mexico/read_MEX_BUR3_from_pdf.py (+63 -66)
35. src/unfccc_ghg_data/unfccc_reader/Montenegro/__init__.py (+30 -0)
36. src/unfccc_ghg_data/unfccc_reader/Montenegro/config_mne_bur3.py (+103 -47)
37. src/unfccc_ghg_data/unfccc_reader/Montenegro/read_MNE_BUR3_from_pdf.py (+88 -56)
38. src/unfccc_ghg_data/unfccc_reader/Morocco/__init__.py (+30 -0)
39. src/unfccc_ghg_data/unfccc_reader/Morocco/config_mar_bur3.py (+187 -108)
40. src/unfccc_ghg_data/unfccc_reader/Morocco/read_MAR_BUR3_from_pdf.py (+122 -88)
41. src/unfccc_ghg_data/unfccc_reader/Nigeria/__init__.py (+30 -0)
42. src/unfccc_ghg_data/unfccc_reader/Nigeria/config_nga_bur2.py (+294 -272)
43. src/unfccc_ghg_data/unfccc_reader/Nigeria/read_NGA_BUR2_from_pdf.py (+137 -103)
44. src/unfccc_ghg_data/unfccc_reader/Peru/__init__.py (+30 -0)
45. src/unfccc_ghg_data/unfccc_reader/Peru/config_per_bur3.py (+77 -66)
46. src/unfccc_ghg_data/unfccc_reader/Peru/read_PER_BUR3_from_pdf.py (+33 -20)
47. src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/__init__.py (+30 -0)
48. src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/config_kor_bur4.py (+511 -403)
49. src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2021_Inventory_from_xlsx.py (+125 -76)
50. src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2022_Inventory_from_xlsx.py (+140 -82)
51. src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py (+75 -47)
52. src/unfccc_ghg_data/unfccc_reader/Singapore/__init__.py (+30 -0)
53. src/unfccc_ghg_data/unfccc_reader/Singapore/config_sgp_bur5.py (+408 -256)
54. src/unfccc_ghg_data/unfccc_reader/Singapore/read_SGP_BUR5_from_pdf.py (+110 -72)
55. src/unfccc_ghg_data/unfccc_reader/Taiwan/__init__.py (+30 -0)
56. src/unfccc_ghg_data/unfccc_reader/Taiwan/config_twn_nir2022.py (+194 -120)
57. src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2022_Inventory_from_pdf.py (+164 -104)
58. src/unfccc_ghg_data/unfccc_reader/Thailand/__init__.py (+30 -0)
59. src/unfccc_ghg_data/unfccc_reader/Thailand/config_tha_bur3.py (+405 -223)
60. src/unfccc_ghg_data/unfccc_reader/Thailand/config_tha_bur4.py (+461 -250)
61. src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR3_from_pdf.py (+129 -89)
62. src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR4_from_pdf.py (+90 -64)
63. src/unfccc_ghg_data/unfccc_reader/__init__.py (+16 -1)
64. src/unfccc_ghg_data/unfccc_reader/get_submissions_info.py (+26 -19)
65. src/unfccc_ghg_data/unfccc_reader/read_UNFCCC_submission.py (+25 -14)

+ 4 - 0
Makefile

@@ -40,6 +40,10 @@ black:  ## format the code using black
 ruff-fixes:  ## fix the code using ruff
 	poetry run ruff src tests scripts docs/source/conf.py docs/source/notebooks/*.py --fix
 
+.PHONY: ruff-fixes-current
+ruff-fixes-current:  ## fix the code using ruff
+	poetry run ruff src/unfccc_ghg_data/unfccc_reader --fix
+
 
 .PHONY: test
 test:  ## run the tests

+ 7 - 8
docs/source/conf.py

@@ -4,17 +4,16 @@ Configuration file for the Sphinx documentation builder.
 For the full list of built-in configuration values, see the documentation:
 https://www.sphinx-doc.org/en/master/usage/configuration.html
 """
+import os
 from functools import wraps
+from pathlib import Path
 
 from sphinxcontrib_autodocgen import AutoDocGen
 
-import os
-from pathlib import Path
 os.environ["UNFCCC_GHG_ROOT_PATH"] = str(Path("..") / "..")
 
 import unfccc_ghg_data
 
-
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
@@ -58,7 +57,7 @@ extensions = [
     # math support
     "sphinx.ext.mathjax",
     # execute code
-    # "sphinx_exec_code",
+    "sphinx_exec_code",
 ]
 
 # general sphinx settings
@@ -144,10 +143,10 @@ nb_execution_show_tb = True
 nb_execution_timeout = 120
 nb_custom_formats = {".py": ["jupytext.reads", {"fmt": "py:percent"}]}
 
-# # exec-code config
-# exec_code_working_dir = Path('..') / '..'
-# exec_code_source_folders = [Path('..') / '..' / 'src' / 'unfccc_ghg_data']
-# exec_code_example_dir = '.'
+# exec-code config
+exec_code_working_dir = "."  # Path('..') / '..'
+exec_code_source_folders = [Path("..") / ".." / "src" / "unfccc_ghg_data"]
+exec_code_example_dir = "."
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

+ 15 - 1
poetry.lock

@@ -1068,6 +1068,20 @@ smb = ["smbprotocol"]
 ssh = ["paramiko"]
 tqdm = ["tqdm"]
 
+[[package]]
+name = "ghostscript"
+version = "0.7"
+description = "Interface to the Ghostscript C-API, both high- and low-level, based on ctypes"
+optional = false
+python-versions = "*"
+files = [
+    {file = "ghostscript-0.7-py2.py3-none-any.whl", hash = "sha256:97c70e27ba6b1cab4ab1d9b4cc82d89b8b53e57971f608ded4950b8aa20c78a7"},
+    {file = "ghostscript-0.7.tar.gz", hash = "sha256:b7875a87098740eb0be3de2d9662d15db727305ca9a6d4b7534a3cc33a4b965a"},
+]
+
+[package.dependencies]
+setuptools = ">=38.6.0"
+
 [[package]]
 name = "globalwarmingpotentials"
 version = "0.9.3"
@@ -4375,4 +4389,4 @@ plots = ["matplotlib"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "db0b517e6af6c99b04624df636fc38cdf49b3ec8dd6dce24596da1cf5796c0ac"
+content-hash = "3591f5e1b1134c148b9f68e3861beb4961659d1af5cb4dd7360ef5396a682f2e"

+ 1 - 0
pyproject.toml

@@ -22,6 +22,7 @@ opencv-python = "^4.8.1.78"
 unfccc-di-api = "^4.0.0"
 dask = "^2023.12.0"
 sphinx-exec-code = "^0.10"
+ghostscript = "^0.7"
 
 [tool.poetry.extras]
 plots = ["matplotlib"]

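Note: the `ghostscript` package added above wraps the Ghostscript C API via ctypes (per its description in poetry.lock). A minimal usage sketch following the package's documented calling convention; the invocation and file names below are illustrative assumptions, not taken from this commit:

import ghostscript

# Arguments mirror the gs command line; the first element is a dummy
# program name. Both file names are placeholders.
args = [
    "gs",
    "-dNOPAUSE", "-dBATCH", "-dSAFER",
    "-sDEVICE=pdfwrite",
    "-sOutputFile=output.pdf",
    "input.pdf",
]
ghostscript.Ghostscript(*args)
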
+ 8 - 3
src/unfccc_ghg_data/__init__.py

@@ -6,15 +6,20 @@ different methods from APIs, xlsx and csv files as well as pdf files.
 """
 import importlib.metadata
 
-from . import (helper, unfccc_reader, unfccc_downloader, unfccc_crf_reader,
-               unfccc_di_reader)
+from . import (
+    helper,
+    unfccc_crf_reader,
+    unfccc_di_reader,
+    unfccc_downloader,
+    unfccc_reader,
+)
 
 __all__ = [
     "helper",
     "unfccc_reader",
     "unfccc_crf_reader",
     "unfccc_di_reader",
-    "unfccc_downloader"
+    "unfccc_downloader",
 ]
 
 __version__ = importlib.metadata.version("unfccc_ghg_data")

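The sorted package imports above only work if `UNFCCC_GHG_ROOT_PATH` is set first: `get_root_path()` in `definitions.py` (below) runs at import time and raises a ValueError otherwise, which is why `conf.py` above exports the variable before importing `unfccc_ghg_data`. A minimal sketch of the required pattern, with a placeholder checkout path:

import os

# Must be set before the first package import; the path is a placeholder.
os.environ["UNFCCC_GHG_ROOT_PATH"] = "/path/to/repo/checkout"

import unfccc_ghg_data

print(unfccc_ghg_data.__version__)
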
+ 1 - 1
src/unfccc_ghg_data/helper/__init__.py

@@ -25,10 +25,10 @@ from .functions import (
     convert_categories,
     create_folder_mapping,
     fix_rows,
+    get_code_file,
     get_country_code,
     get_country_name,
     process_data_for_country,
-    get_code_file,
 )
 
 __all__ = [

+ 234 - 109
src/unfccc_ghg_data/helper/definitions.py

@@ -6,14 +6,14 @@ from pathlib import Path
 
 def get_root_path() -> Path:
     """Get the root_path from an environment variable"""
-    root_path_env = os.getenv('UNFCCC_GHG_ROOT_PATH', None)
+    root_path_env = os.getenv("UNFCCC_GHG_ROOT_PATH", None)
     if root_path_env is None:
-        raise ValueError('UNFCCC_GHG_ROOT_PATH environment '
-                         'variable needs to be set') # noqa: TRY003
+        raise ValueError("UNFCCC_GHG_ROOT_PATH environment " "variable needs to be set")
     else:
         root_path = Path(root_path_env).resolve()
     return root_path
 
+
 root_path = get_root_path()
 code_path = root_path / "src" / "unfccc_ghg_data"
 log_path = root_path / "log"
@@ -36,125 +36,250 @@ custom_country_mapping = {
 }
 
 custom_folders = {
-    'Venezeula_(Bolivarian_Republic_of)': 'VEN',
-    'Venezuela_(Bolivarian_Republic_of)': 'VEN',
-    'Micronesia_(Federated_State_of)': 'FSM',
-    'Micronesia_(Federated_States_of)': 'FSM',
-    'The_Republic_of_North_Macedonia': 'MKD',
-    'Republic_of_Korea': 'KOR',
-    'Bolivia_(Plurinational_State_of)': 'BOL',
-    'Türkiye': 'TUR',
-    'Iran_(Islamic_Republic_of)': 'IRN',
-    'Côte_d`Ivoire': 'CIV',
-    'Democratic_Republic_of_the_Congo': "COD",
-    'European_Union': 'EUA',
-    'Taiwan': 'TWN',
+    "Venezeula_(Bolivarian_Republic_of)": "VEN",
+    "Venezuela_(Bolivarian_Republic_of)": "VEN",
+    "Micronesia_(Federated_State_of)": "FSM",
+    "Micronesia_(Federated_States_of)": "FSM",
+    "The_Republic_of_North_Macedonia": "MKD",
+    "Republic_of_Korea": "KOR",
+    "Bolivia_(Plurinational_State_of)": "BOL",
+    "Türkiye": "TUR",
+    "Iran_(Islamic_Republic_of)": "IRN",
+    "Côte_d`Ivoire": "CIV",
+    "Democratic_Republic_of_the_Congo": "COD",
+    "European_Union": "EUA",
+    "Taiwan": "TWN",
 }
 
 GWP_factors = {
-    'SARGWP100_to_AR4GWP100': {
-        'HFCS': 1.1,
-        'PFCS': 1.1,
-        'UnspMixOfHFCs': 1.1,
-        'UnspMixOfPFCs': 1.1,
-        'FGASES': 1.1,
+    "SARGWP100_to_AR4GWP100": {
+        "HFCS": 1.1,
+        "PFCS": 1.1,
+        "UnspMixOfHFCs": 1.1,
+        "UnspMixOfPFCs": 1.1,
+        "FGASES": 1.1,
     },
-    'SARGWP100_to_AR5GWP100': {
-        'HFCS': 1.2,
-        'PFCS': 1.2,
-        'UnspMixOfHFCs': 1.2,
-        'UnspMixOfPFCs': 1.2,
-        'FGASES': 1.2,
+    "SARGWP100_to_AR5GWP100": {
+        "HFCS": 1.2,
+        "PFCS": 1.2,
+        "UnspMixOfHFCs": 1.2,
+        "UnspMixOfPFCs": 1.2,
+        "FGASES": 1.2,
     },
-    'SARGWP100_to_AR6GWP100': {
-        'HFCS': 1.4,
-        'PFCS': 1.3,
-        'UnspMixOfHFCs': 1.4,
-        'UnspMixOfPFCs': 1.3,
-        'FGASES': 1.35,
+    "SARGWP100_to_AR6GWP100": {
+        "HFCS": 1.4,
+        "PFCS": 1.3,
+        "UnspMixOfHFCs": 1.4,
+        "UnspMixOfPFCs": 1.3,
+        "FGASES": 1.35,
     },
-    'AR4GWP100_to_SARGWP100': {
-        'HFCS': 0.91,
-        'PFCS': 0.91,
-        'UnspMixOfHFCs': 0.91,
-        'UnspMixOfPFCs': 0.91,
-        'FGASES': 0.91,
+    "AR4GWP100_to_SARGWP100": {
+        "HFCS": 0.91,
+        "PFCS": 0.91,
+        "UnspMixOfHFCs": 0.91,
+        "UnspMixOfPFCs": 0.91,
+        "FGASES": 0.91,
     },
-    'AR4GWP100_to_AR5GWP100': {
-        'HFCS': 1.1,
-        'PFCS': 1.1,
-        'UnspMixOfHFCs': 1.1,
-        'UnspMixOfPFCs': 1.1,
-        'FGASES': 1.1,
+    "AR4GWP100_to_AR5GWP100": {
+        "HFCS": 1.1,
+        "PFCS": 1.1,
+        "UnspMixOfHFCs": 1.1,
+        "UnspMixOfPFCs": 1.1,
+        "FGASES": 1.1,
     },
-    'AR4GWP100_to_AR6GWP100': {
-        'HFCS': 1.27,
-        'PFCS': 1.18,
-        'UnspMixOfHFCs': 1.27,
-        'UnspMixOfPFCs': 1.18,
-        'FGASES': 1.23,
+    "AR4GWP100_to_AR6GWP100": {
+        "HFCS": 1.27,
+        "PFCS": 1.18,
+        "UnspMixOfHFCs": 1.27,
+        "UnspMixOfPFCs": 1.18,
+        "FGASES": 1.23,
     },
-    'AR5GWP100_to_SARGWP100': {
-        'HFCS': 0.83,
-        'PFCS': 0.83,
-        'UnspMixOfHFCs': 0.83,
-        'UnspMixOfPFCs': 0.83,
-        'FGASES': 0.83,
+    "AR5GWP100_to_SARGWP100": {
+        "HFCS": 0.83,
+        "PFCS": 0.83,
+        "UnspMixOfHFCs": 0.83,
+        "UnspMixOfPFCs": 0.83,
+        "FGASES": 0.83,
     },
-    'AR5GWP100_to_AR4GWP100': {
-        'HFCS': 0.91,
-        'PFCS': 0.91,
-        'UnspMixOfHFCs': 0.91,
-        'UnspMixOfPFCs': 0.91,
-        'FGASES': 0.91,
+    "AR5GWP100_to_AR4GWP100": {
+        "HFCS": 0.91,
+        "PFCS": 0.91,
+        "UnspMixOfHFCs": 0.91,
+        "UnspMixOfPFCs": 0.91,
+        "FGASES": 0.91,
     },
-    'AR5GWP100_to_AR6GWP100': {
-        'HFCS': 1.17,
-        'PFCS': 1.08,
-        'UnspMixOfHFCs': 1.17,
-        'UnspMixOfPFCs': 1.08,
-        'FGASES': 1.125,
+    "AR5GWP100_to_AR6GWP100": {
+        "HFCS": 1.17,
+        "PFCS": 1.08,
+        "UnspMixOfHFCs": 1.17,
+        "UnspMixOfPFCs": 1.08,
+        "FGASES": 1.125,
     },
 }
 
 gas_baskets = {
-    'HFCS (SARGWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
-                     'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
-                     'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
-                     'HFC407c', 'HFC410a', 'HFC4310mee', #'OTHERHFCS (SARGWP100)',
-                         'UnspMixOfHFCs (SARGWP100)'],
-    'HFCS (AR4GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
-                     'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
-                     'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
-                     'HFC407c', 'HFC410a', 'HFC4310mee', 'UnspMixOfHFCs (AR4GWP100)'],
-    'HFCS (AR5GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
-                      'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
-                      'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
-                      'HFC407c', 'HFC410a', 'HFC4310mee',
-                         'UnspMixOfHFCs (AR5GWP100)'],
-    'HFCS (AR6GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
-                      'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
-                      'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
-                      'HFC407c', 'HFC410a', 'HFC4310mee',
-                         'UnspMixOfHFCs (AR6GWP100)'],
-    'PFCS (SARGWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',
-                      'UnspMixOfPFCs (SARGWP100)'],
-    'PFCS (AR4GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',
-                      'UnspMixOfPFCs (AR4GWP100)'],
-    'PFCS (AR5GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',
-                      'UnspMixOfPFCs (AR5GWP100)'],
-    'PFCS (AR6GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',
-                      'UnspMixOfPFCs (AR6GWP100)'],
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (SARGWP100)',
-                          'PFCS (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR4GWP100)',
-                          'PFCS (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR5GWP100)',
-                            'PFCS (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR6GWP100)',
-                            'PFCS (AR6GWP100)'],
+    "HFCS (SARGWP100)": [
+        "HFC23",
+        "HFC32",
+        "HFC41",
+        "HFC125",
+        "HFC134",
+        "HFC134a",
+        "HFC143",
+        "HFC143a",
+        "HFC152a",
+        "HFC227ea",
+        "HFC236fa",
+        "HFC245ca",
+        "HFC245fa",
+        "HFC365mfc",
+        "HFC404a",
+        "HFC407c",
+        "HFC410a",
+        "HFC4310mee",  #'OTHERHFCS (SARGWP100)',
+        "UnspMixOfHFCs (SARGWP100)",
+    ],
+    "HFCS (AR4GWP100)": [
+        "HFC23",
+        "HFC32",
+        "HFC41",
+        "HFC125",
+        "HFC134",
+        "HFC134a",
+        "HFC143",
+        "HFC143a",
+        "HFC152a",
+        "HFC227ea",
+        "HFC236fa",
+        "HFC245ca",
+        "HFC245fa",
+        "HFC365mfc",
+        "HFC404a",
+        "HFC407c",
+        "HFC410a",
+        "HFC4310mee",
+        "UnspMixOfHFCs (AR4GWP100)",
+    ],
+    "HFCS (AR5GWP100)": [
+        "HFC23",
+        "HFC32",
+        "HFC41",
+        "HFC125",
+        "HFC134",
+        "HFC134a",
+        "HFC143",
+        "HFC143a",
+        "HFC152a",
+        "HFC227ea",
+        "HFC236fa",
+        "HFC245ca",
+        "HFC245fa",
+        "HFC365mfc",
+        "HFC404a",
+        "HFC407c",
+        "HFC410a",
+        "HFC4310mee",
+        "UnspMixOfHFCs (AR5GWP100)",
+    ],
+    "HFCS (AR6GWP100)": [
+        "HFC23",
+        "HFC32",
+        "HFC41",
+        "HFC125",
+        "HFC134",
+        "HFC134a",
+        "HFC143",
+        "HFC143a",
+        "HFC152a",
+        "HFC227ea",
+        "HFC236fa",
+        "HFC245ca",
+        "HFC245fa",
+        "HFC365mfc",
+        "HFC404a",
+        "HFC407c",
+        "HFC410a",
+        "HFC4310mee",
+        "UnspMixOfHFCs (AR6GWP100)",
+    ],
+    "PFCS (SARGWP100)": [
+        "C3F8",
+        "C4F10",
+        "CF4",
+        "C2F6",
+        "C6F14",
+        "C5F12",
+        "cC4F8",
+        "UnspMixOfPFCs (SARGWP100)",
+    ],
+    "PFCS (AR4GWP100)": [
+        "C3F8",
+        "C4F10",
+        "CF4",
+        "C2F6",
+        "C6F14",
+        "C5F12",
+        "cC4F8",
+        "UnspMixOfPFCs (AR4GWP100)",
+    ],
+    "PFCS (AR5GWP100)": [
+        "C3F8",
+        "C4F10",
+        "CF4",
+        "C2F6",
+        "C6F14",
+        "C5F12",
+        "cC4F8",
+        "UnspMixOfPFCs (AR5GWP100)",
+    ],
+    "PFCS (AR6GWP100)": [
+        "C3F8",
+        "C4F10",
+        "CF4",
+        "C2F6",
+        "C6F14",
+        "C5F12",
+        "cC4F8",
+        "UnspMixOfPFCs (AR6GWP100)",
+    ],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": [
+        "CO2",
+        "CH4",
+        "N2O",
+        "SF6",
+        "NF3",
+        "HFCS (SARGWP100)",
+        "PFCS (SARGWP100)",
+    ],
+    "KYOTOGHG (AR4GWP100)": [
+        "CO2",
+        "CH4",
+        "N2O",
+        "SF6",
+        "NF3",
+        "HFCS (AR4GWP100)",
+        "PFCS (AR4GWP100)",
+    ],
+    "KYOTOGHG (AR5GWP100)": [
+        "CO2",
+        "CH4",
+        "N2O",
+        "SF6",
+        "NF3",
+        "HFCS (AR5GWP100)",
+        "PFCS (AR5GWP100)",
+    ],
+    "KYOTOGHG (AR6GWP100)": [
+        "CO2",
+        "CH4",
+        "N2O",
+        "SF6",
+        "NF3",
+        "HFCS (AR6GWP100)",
+        "PFCS (AR6GWP100)",
+    ],
 }

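The `GWP_factors` table above holds multiplicative factors for moving aggregate baskets between GWP metrics, and `gas_baskets` lists the members of each basket per metric. A worked example, assuming the dicts exactly as defined above (the emissions value is hypothetical):

import os

os.environ["UNFCCC_GHG_ROOT_PATH"] = "."  # required at import time; placeholder

from unfccc_ghg_data.helper.definitions import GWP_factors

hfcs_sar = 100.0  # hypothetical HFCS total in kt CO2eq (SAR GWP-100)
factor = GWP_factors["SARGWP100_to_AR4GWP100"]["HFCS"]  # 1.1
hfcs_ar4 = hfcs_sar * factor  # 110.0 kt CO2eq in AR4 GWP-100
print(f"{hfcs_sar} (SARGWP100) -> {hfcs_ar4} (AR4GWP100)")
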
+ 6 - 5
src/unfccc_ghg_data/helper/folder_mapping.py

@@ -1,4 +1,4 @@
-""" create mapping of folder to countries
+"""create mapping of folder to countries
 
 this script takes a folder as input (from doit) and
 runs creates the mapping of subfolders to country codes
@@ -13,16 +13,17 @@ if __name__ == "__main__":
     # Find the right function and possible input and output files and
     # read the data using datalad run.
     parser = argparse.ArgumentParser()
-    parser.add_argument('--folder', help='folder name, relative to '
-                                         'repository root folder')
+    parser.add_argument(
+        "--folder", help="folder name, relative to " "repository root folder"
+    )
     args = parser.parse_args()
     folder = args.folder
 
-    if 'extracted_data' in folder:
+    if "extracted_data" in folder:
         extracted = True
     else:
         extracted = False
 
     # print available submissions
-    print("="*10 + f" Creating folder mapping for  {folder} " + "="*10)
+    print("=" * 10 + f" Creating folder mapping for  {folder} " + "=" * 10)
     create_folder_mapping(folder, extracted)

+ 4 - 6
src/unfccc_ghg_data/helper/functions.py

@@ -1,4 +1,4 @@
-""" common functions for unfccc_ghg_data
+"""common functions for unfccc_ghg_data
 
 Functions used by the different readers and downloaders in the unfccc_ghg_data package
 """
@@ -74,8 +74,6 @@ def process_data_for_country(
     xr.Dataset: processed dataset
 
     """
-
-
     # 0: gather information
     countries = list(data_country.coords[data_country.attrs["area"]].values)
     if len(countries) > 1:
@@ -956,9 +954,7 @@ def get_code_file(
                     )
                 else:
                     if print_info:
-                        print(
-                            f"Found code file {file.relative_to(root_path)}"
-                        )
+                        print(f"Found code file {file.relative_to(root_path)}")
                 code_file_path = file
 
     if code_file_path is not None:
@@ -1011,8 +1007,10 @@ def fix_rows(
         new_row = new_row.str.replace("- ", "-")
         # replace spaces in numbers
         pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
+
         def repl(m):
             return f"{m.group('first')}{m.group('last')}"
+
         new_row = new_row.str.replace(pat, repl, regex=True)
         data.loc[indices_to_merge[0]] = new_row
         data = data.drop(indices_to_merge[1:])

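The `repl` callable split out in the `fix_rows` hunk above rejoins numbers that PDF extraction broke with a stray space. A minimal sketch of the pattern in isolation; the sample data is illustrative:

import pandas as pd

pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"


def repl(m):
    # Glue the two numeric halves back together, dropping the space.
    return f"{m.group('first')}{m.group('last')}"


row = pd.Series(["1 234.56", "12.3", "4 567"])
print(row.str.replace(pat, repl, regex=True).tolist())
# -> ['1234.56', '12.3', '4567']
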
+ 286 - 176
src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_core.py

@@ -27,13 +27,13 @@ from .util import NoCRFFilesError
 
 ### reading functions
 def convert_crf_table_to_pm2if(
-        df_table: pd.DataFrame,
-        submission_year: int,
-        entity_mapping: Optional[dict[str,str]]=None,
-        coords_defaults_input: Optional[dict[str,str]]=None,
-        filter_remove_input: Optional[dict[str,dict[str,Union[str,list]]]]=None,
-        filter_keep_input: Optional[dict[str,dict[str,Union[str,list]]]]=None,
-        meta_data_input: Optional[dict[str,str]]=None,
+    df_table: pd.DataFrame,
+    submission_year: int,
+    entity_mapping: Optional[dict[str, str]] = None,
+    coords_defaults_input: Optional[dict[str, str]] = None,
+    filter_remove_input: Optional[dict[str, dict[str, Union[str, list]]]] = None,
+    filter_keep_input: Optional[dict[str, dict[str, Union[str, list]]]] = None,
+    meta_data_input: Optional[dict[str, str]] = None,
 ) -> pd.DataFrame:
     """
     Converts a given pandas long format crf table to PRIMAP2 interchange format
@@ -82,7 +82,7 @@ def convert_crf_table_to_pm2if(
     }
 
     add_coords_cols = {
-    #    "orig_cat_name": ["orig_cat_name", "category"],
+        #    "orig_cat_name": ["orig_cat_name", "category"],
     }
 
     coords_terminologies = {
@@ -108,8 +108,8 @@ def convert_crf_table_to_pm2if(
     if entity_mapping is not None:
         coords_value_mapping["entity"] = entity_mapping
 
-    #coords_value_filling_template = {
-    #}
+    # coords_value_filling_template = {
+    # }
 
     filter_remove = {
         "f1": {
@@ -120,13 +120,11 @@ def convert_crf_table_to_pm2if(
         for key in filter_remove_input.keys():
             filter_remove[key] = filter_remove_input[key]
 
-    filter_keep = {
-    }
+    filter_keep = {}
     if filter_keep_input is not None:
         for key in filter_keep_input.keys():
             filter_keep[key] = filter_keep_input[key]
 
-
     meta_data = {
         "references": f"https://unfccc.int/ghg-inventories-annex-i-parties/{submission_year}",
         "rights": "",
@@ -146,7 +144,7 @@ def convert_crf_table_to_pm2if(
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
         filter_keep=filter_keep,
         meta_data=meta_data,
@@ -156,13 +154,13 @@ def convert_crf_table_to_pm2if(
 
 
 def read_crf_table(
-        country_codes: Union[str, list[str]],
-        table: str,
-        submission_year: int,
-        data_year: Optional[Union[int, list[int]]]=None,
-        date: Optional[str]=None,
-        folder: Optional[str]=None,
-        debug: Optional[bool]=False,
+    country_codes: Union[str, list[str]],
+    table: str,
+    submission_year: int,
+    data_year: Optional[Union[int, list[int]]] = None,
+    date: Optional[str] = None,
+    folder: Optional[str] = None,
+    debug: Optional[bool] = False,
 ) -> tuple[pd.DataFrame, list[list], list[list]]:
     """
     Read CRF table for given submission year and country / or countries
@@ -216,13 +214,15 @@ def read_crf_table(
         country_codes = [country_codes]
 
     # get file names and locations
-    input_files = get_crf_files(country_codes=country_codes,
-                                submission_year=submission_year,
-                                data_year=data_year,
-                                date=date,
-                                folder=folder)
+    input_files = get_crf_files(
+        country_codes=country_codes,
+        submission_year=submission_year,
+        data_year=data_year,
+        date=date,
+        folder=folder,
+    )
     # nasty fix for cases where exporting ran overnight and not all files have the same date
-    if (date is not None) and (len(country_codes)==1):
+    if (date is not None) and (len(country_codes) == 1):
         if isinstance(data_year, list):
             expected_files = len(data_year)
         elif isinstance(data_year, int):
@@ -230,17 +230,23 @@ def read_crf_table(
         else:
             expected_files = submission_year - 1990 - 1
         if len(input_files) < expected_files:
-            print(f"Found only {len(input_files)} input files for {country_codes}. "
-                  f"Expected {expected_files}.")
-            print("Possibly exporting run overnight and some files have the previous day as date.")
+            print(
+                f"Found only {len(input_files)} input files for {country_codes}. "
+                f"Expected {expected_files}."
+            )
+            print(
+                "Possibly exporting run overnight and some files have the previous day as date."
+            )
             date_datetime = datetime.strptime(date, "%d%m%Y")
             date_datetime = date_datetime - timedelta(days=1)
             prv_date = date_datetime.strftime("%d%m%Y")
-            more_input_files = get_crf_files(country_codes=country_codes,
-                                             submission_year=submission_year,
-                                             data_year=data_year,
-                                             date=prv_date,
-                                             folder=folder)
+            more_input_files = get_crf_files(
+                country_codes=country_codes,
+                submission_year=submission_year,
+                data_year=data_year,
+                date=prv_date,
+                folder=folder,
+            )
             if len(more_input_files) > 0:
                 print(f"Found {len(more_input_files)} additional input files.")
                 input_files = input_files + more_input_files
@@ -248,11 +254,13 @@ def read_crf_table(
                 print("Found no additional input files")
 
     if input_files == []:
-        raise NoCRFFilesError(f"No files found for {country_codes}, "
-                              f"submission_year={submission_year}, "
-                              f"data_year={data_year}, "
-                              f"date={date}, "
-                              f"folder={folder}.")
+        raise NoCRFFilesError(
+            f"No files found for {country_codes}, "
+            f"submission_year={submission_year}, "
+            f"data_year={data_year}, "
+            f"date={date}, "
+            f"folder={folder}."
+        )
 
     # get specification
     # if we only have a single country check if we might have a country specific
@@ -260,21 +268,25 @@ def read_crf_table(
     if len(country_codes) == 1:
         try:
             crf_spec = getattr(crf, f"CRF{submission_year}_{country_codes[0]}")
-            print(f"Using country specific specification: "
-                  f"CRF{submission_year}_{country_codes[0]}")
+            print(
+                f"Using country specific specification: "
+                f"CRF{submission_year}_{country_codes[0]}"
+            )
         except:
             # no country specific specification, check for general specification
             try:
                 crf_spec = getattr(crf, f"CRF{submission_year}")
             except:
-                raise ValueError(f"No terminology exists for submission year "
-                                 f"{submission_year}")
+                raise ValueError(
+                    f"No terminology exists for submission year " f"{submission_year}"
+                )
     else:
         try:
             crf_spec = getattr(crf, f"CRF{submission_year}")
         except:
-            raise ValueError(f"No terminology exists for submission year "
-                             f"{submission_year}")
+            raise ValueError(
+                f"No terminology exists for submission year " f"{submission_year}"
+            )
 
     # now loop over files and read them
     df_all = None
@@ -284,8 +296,11 @@ def read_crf_table(
         file_info = get_info_from_crf_filename(file.name)
         try:
             int(file_info["data_year"])
-            df_this_file, unknown_rows_this_file, last_row_info_this_file = \
-                read_crf_table_from_file(file, table, crf_spec[table], debug=debug)
+            (
+                df_this_file,
+                unknown_rows_this_file,
+                last_row_info_this_file,
+            ) = read_crf_table_from_file(file, table, crf_spec[table], debug=debug)
             if df_all is None:
                 df_all = df_this_file.copy(deep=True)
                 unknown_rows = unknown_rows_this_file
@@ -301,10 +316,10 @@ def read_crf_table(
 
 
 def read_crf_table_from_file(
-        file: Path,
-        table: str,
-        table_spec: dict[str, dict],
-        debug: Optional[bool]=False,
+    file: Path,
+    table: str,
+    table_spec: dict[str, dict],
+    debug: Optional[bool] = False,
 ) -> tuple[pd.DataFrame, list[list], list[list]]:
     """
     Read a single CRF table from a given file. This is the core function of the CRF
@@ -344,7 +359,6 @@ def read_crf_table_from_file(
             dlds = dl.api.Dataset(root_path)
             dlds.get(file.relative_to(root_path))
 
-
     table_properties = table_spec["table"]
     file_info = get_info_from_crf_filename(file.name)
 
@@ -353,16 +367,23 @@ def read_crf_table_from_file(
     all_cats = [cat[0] for cat in all_cats_mapping]
 
     unique_cats = [cat for (cat, count) in Counter(all_cats).items() if count == 1]
-    unique_cat_tuples = [mapping for mapping in all_cats_mapping if mapping[0] in unique_cats]
-    unique_mapping = dict(zip([tup[0] for tup in unique_cat_tuples],
-                              [tup[1] for tup in unique_cat_tuples]))
+    unique_cat_tuples = [
+        mapping for mapping in all_cats_mapping if mapping[0] in unique_cats
+    ]
+    unique_mapping = dict(
+        zip(
+            [tup[0] for tup in unique_cat_tuples], [tup[1] for tup in unique_cat_tuples]
+        )
+    )
     non_unique_cats = [cat for (cat, count) in Counter(all_cats).items() if count > 1]
 
     # prepare the sector hierarchy
     if non_unique_cats:
         # if we have non-unique categories present we need the information on
         # levels within the category hierarchy
-        category_tree = create_category_tree(all_cats_mapping, table, file_info["party"])
+        category_tree = create_category_tree(
+            all_cats_mapping, table, file_info["party"]
+        )
 
     # prepare index colum information
     cat_col = table_properties["col_for_categories"]
@@ -372,20 +393,37 @@ def read_crf_table_from_file(
     # read the data
     print(f"Reading table {table} for year {file_info['data_year']} from {file.name}.")
     skiprows = table_properties["firstrow"] - 1
-    nrows = table_properties["lastrow"] - skiprows + 1 # read one row more to check if we reached the end
+    nrows = (
+        table_properties["lastrow"] - skiprows + 1
+    )  # read one row more to check if we reached the end
     # we read with user specific NaN treatment as the NaN treatment is part of the conversion to
     # PRIMAP2 format.
-    df_raw = pd.read_excel(file, sheet_name=table, skiprows=skiprows , nrows=nrows, engine="openpyxl",
-                               na_values=['-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN',
-                                          'NULL', 'NaN', ''], keep_default_na=False)
+    df_raw = pd.read_excel(
+        file,
+        sheet_name=table,
+        skiprows=skiprows,
+        nrows=nrows,
+        engine="openpyxl",
+        na_values=[
+            "-1.#IND",
+            "-1.#QNAN",
+            "-NaN",
+            "-nan",
+            "1.#IND",
+            "1.#QNAN",
+            "NULL",
+            "NaN",
+            "",
+        ],
+        keep_default_na=False,
+    )
 
     if len(df_raw) < nrows:
-        #print(f"read data truncated because of all-nan rows")
+        # print(f"read data truncated because of all-nan rows")
         last_row_nan = True
     else:
         last_row_nan = False
 
-
     cols_to_drop = []
     # remove empty first column (for Australia tables start with an empty column)
     # df_raw = df_raw.dropna(how="all", axis=1)
@@ -394,13 +432,14 @@ def read_crf_table_from_file(
     # select only first table by cutting everything after a all-nan column (unless
     # it's the first column)
     if debug:
-        print(f'Header before table end detection: {df_raw.columns.values}')
+        print(f"Header before table end detection: {df_raw.columns.values}")
     for colIdx in range(1, len(df_raw.columns.values)):
-        if ((df_raw.iloc[:, colIdx].isna().all()) &
-                (df_raw.columns[colIdx].startswith('Unnamed'))):
+        if (df_raw.iloc[:, colIdx].isna().all()) & (
+            df_raw.columns[colIdx].startswith("Unnamed")
+        ):
             cols_to_drop = cols_to_drop + list(df_raw.columns.values[colIdx:])
             if debug:
-                print(f'cols_to_drop: {cols_to_drop}')
+                print(f"cols_to_drop: {cols_to_drop}")
             break
 
     if cols_to_drop is not None:
@@ -414,7 +453,7 @@ def read_crf_table_from_file(
     # the filling leads to long and a bit confusing headers, but as long
     # as pandas can not fill values of merged cells in all individual cells
     # we have to use some filling algorithm.
-    df_header = df_raw.iloc[0:len(table_properties["header"])-1].copy(deep=True)
+    df_header = df_raw.iloc[0 : len(table_properties["header"]) - 1].copy(deep=True)
     df_header.loc[-1] = df_header.columns.values
     df_header.index = df_header.index + 1
     # replace "Unnamed: X" colum names by nan to fill from left in next step
@@ -447,15 +486,17 @@ def read_crf_table_from_file(
                         entities[col] = f"{entities[col]} {value}"
 
     if units is None:
-        raise ValueError(f"Specification for table {table} does not contain unit information.")
+        raise ValueError(
+            f"Specification for table {table} does not contain unit information."
+        )
 
     # remove double spaces
     entities = [entity.strip() for entity in entities]
-    entities = [re.sub('\\s+', ' ', entity) for entity in entities]
+    entities = [re.sub("\\s+", " ", entity) for entity in entities]
 
     # replace the old header
     if len(header) > 2:
-        df_current = df_raw.drop(index=df_raw.iloc[0:len(header)-2].index)
+        df_current = df_raw.drop(index=df_raw.iloc[0 : len(header) - 2].index)
     else:
         df_current = df_raw
 
@@ -469,11 +510,11 @@ def read_crf_table_from_file(
     # remove double spaces
     for col in cols_for_space_stripping:
         df_current[col] = df_current[col].str.strip()
-        df_current[col] = df_current[col].replace('\\s+', ' ', regex=True)
+        df_current[col] = df_current[col].replace("\\s+", " ", regex=True)
 
     # prepare for sector mapping by initializing result lists and
     # variables
-    new_cats = [[''] * len(table_properties["categories"])] * len(df_current)
+    new_cats = [[""] * len(table_properties["categories"])] * len(df_current)
 
     # copy the header rows which are not part of the index (unit)
     new_cats[0] = [df_current.iloc[0][cat_col]] * len(table_properties["categories"])
@@ -485,7 +526,9 @@ def read_crf_table_from_file(
     if non_unique_cats:
         # need to initialize the tree parsing.
         last_parent = category_tree.get_node("root")
-        all_nodes = set([category_tree.get_node(node).tag for node in category_tree.nodes])
+        all_nodes = set(
+            [category_tree.get_node(node).tag for node in category_tree.nodes]
+        )
 
         for idx in range(1, len(df_current)):
             current_cat = df_current.iloc[idx][cat_col]
@@ -497,8 +540,12 @@ def read_crf_table_from_file(
                 break
 
             # check if current category is a child of the last node
-            children = dict([[child.tag, child.identifier]
-                        for child in category_tree.children(last_parent.identifier)])
+            children = dict(
+                [
+                    [child.tag, child.identifier]
+                    for child in category_tree.children(last_parent.identifier)
+                ]
+            )
             if current_cat in children.keys():
                 # the current category is a child of the current parent
                 # do the mapping
@@ -517,21 +564,39 @@ def read_crf_table_from_file(
                 if current_cat in all_nodes:
                     old_parent = last_parent
 
-                    while (current_cat not in children.keys()) and \
-                            (last_parent.identifier != "root"):
+                    while (current_cat not in children.keys()) and (
+                        last_parent.identifier != "root"
+                    ):
                         last_parent = category_tree.get_node(
-                            last_parent.predecessor(category_tree.identifier))
-                        children = dict([[child.tag, child.identifier]
-                                    for child in category_tree.children(last_parent.identifier)])
-
-                    if (last_parent.identifier == "root") and \
-                        (current_cat not in children.keys()):
+                            last_parent.predecessor(category_tree.identifier)
+                        )
+                        children = dict(
+                            [
+                                [child.tag, child.identifier]
+                                for child in category_tree.children(
+                                    last_parent.identifier
+                                )
+                            ]
+                        )
+
+                    if (last_parent.identifier == "root") and (
+                        current_cat not in children.keys()
+                    ):
                         # we have not found the category as direct child of any of the
                         # predecessors. Thus it is missing in the specification in
                         # that place
-                        print(f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, "
-                              f"{file_info['data_year']} (last parent: {old_parent.tag}).")
-                        unknown_categories.append([table, file_info["party"], current_cat, file_info['data_year']])
+                        print(
+                            f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, "
+                            f"{file_info['data_year']} (last parent: {old_parent.tag})."
+                        )
+                        unknown_categories.append(
+                            [
+                                table,
+                                file_info["party"],
+                                current_cat,
+                                file_info["data_year"],
+                            ]
+                        )
                         # copy back the parent info to continue with next category
                         last_parent = old_parent
                     else:
@@ -543,8 +608,12 @@ def read_crf_table_from_file(
                         if new_children:
                             last_parent = node
                 else:
-                    print(f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, {file_info['data_year']}.")
-                    unknown_categories.append([table, file_info["party"], current_cat, file_info['data_year']])
+                    print(
+                        f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, {file_info['data_year']}."
+                    )
+                    unknown_categories.append(
+                        [table, file_info["party"], current_cat, file_info["data_year"]]
+                    )
     else:
         for idx in range(1, len(df_current)):
             current_cat = df_current.iloc[idx][cat_col]
@@ -557,30 +626,45 @@ def read_crf_table_from_file(
             if current_cat in all_cats:
                 new_cats[idx] = unique_mapping[current_cat]
                 if (idx == len(df_current) - 1) and not last_row_nan:
-                    print(f"found information in last row: category {current_cat}, row {idx}")
-                    info_last_row.append([table, file_info["party"], current_cat, file_info['data_year']])
+                    print(
+                        f"found information in last row: category {current_cat}, row {idx}"
+                    )
+                    info_last_row.append(
+                        [table, file_info["party"], current_cat, file_info["data_year"]]
+                    )
             else:
-                print(f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, {file_info['data_year']}.")
-                unknown_categories.append([table, file_info["party"], current_cat, file_info['data_year']])
+                print(
+                    f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, {file_info['data_year']}."
+                )
+                unknown_categories.append(
+                    [table, file_info["party"], current_cat, file_info["data_year"]]
+                )
 
     for idx, col in enumerate(table_properties["categories"]):
-        df_current.insert(loc=idx, column=col, value=
-                          [cat[idx] for cat in new_cats])
+        df_current.insert(loc=idx, column=col, value=[cat[idx] for cat in new_cats])
 
     # set index
     df_current = df_current.set_index(index_cols)
     # process the unit information using the primap2 functions
-    df_current = pm2.pm2io.nir_add_unit_information(df_current, **table_properties["unit_info"])
+    df_current = pm2.pm2io.nir_add_unit_information(
+        df_current, **table_properties["unit_info"]
+    )
 
     # convert to long format
-    header_long = table_properties["categories"] + \
-        ["orig_cat_name", "entity", "unit", "time", "data"]
+    header_long = table_properties["categories"] + [
+        "orig_cat_name",
+        "entity",
+        "unit",
+        "time",
+        "data",
+    ]
     df_long = pm2.pm2io.nir_convert_df_to_long(
-        df_current, file_info["data_year"], header_long=header_long)
+        df_current, file_info["data_year"], header_long=header_long
+    )
 
     # add country information
     df_long.insert(0, column="country", value=file_info["party"])
-    #df_long.insert(1, column="submission", value=f"CRF{file_info['submission_year']}")
+    # df_long.insert(1, column="submission", value=f"CRF{file_info['submission_year']}")
     if "coords_defaults" in table_spec.keys():
     if "coords_defaults" in table_spec.keys():
         for col in table_spec["coords_defaults"]:
         for col in table_spec["coords_defaults"]:
             df_long.insert(2, column=col, value=table_spec["coords_defaults"][col])
             df_long.insert(2, column=col, value=table_spec["coords_defaults"][col])
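
The wide-to-long reshaping above is done by pm2.pm2io.nir_convert_df_to_long; as a rough stand-in for what it produces, the same idea in plain pandas (toy data and column names, not the primap2 helper itself):

    import pandas as pd

    df_wide = pd.DataFrame(
        {
            "category": ["1", "1.A"],
            "orig_cat_name": ["Energy", "Fuel combustion"],
            "entity": ["CO2", "CO2"],
            "unit": ["Gg", "Gg"],
            2021: [100.0, 80.0],  # one data column per year
        }
    )
    df_long = df_wide.melt(
        id_vars=["category", "orig_cat_name", "entity", "unit"],
        var_name="time",
        value_name="data",
    )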
@@ -589,18 +673,17 @@ def read_crf_table_from_file(


 def get_crf_files(
-        country_codes: Union[str, list[str]],
-        submission_year: int,
-        data_year: Optional[Union[int, list[int]]] = None,
-        date: Optional[str] = None,
-        folder: Optional[str] = None,
+    country_codes: Union[str, list[str]],
+    submission_year: int,
+    data_year: Optional[Union[int, list[int]]] = None,
+    date: Optional[str] = None,
+    folder: Optional[str] = None,
 ) -> list[Path]:
     """
     Finds all files according to given parameters

     Parameters
     ----------
-
     country_codes: str or list[str]
         ISO 3-letter country code or list of country codes

@@ -643,14 +726,20 @@ def get_crf_files(
                 new_country_folders = folder_mapping[country_code]
                 if isinstance(new_country_folders, str):
                     # only one folder
-                    country_folders = [*country_folders, data_folder / new_country_folders / submission_folder]
+                    country_folders = [
+                        *country_folders,
+                        data_folder / new_country_folders / submission_folder,
+                    ]
                 else:
-                    country_folders = country_folders + \
-                                      [data_folder / folder / submission_folder
-                                       for folder in new_country_folders]
+                    country_folders = country_folders + [
+                        data_folder / folder / submission_folder
+                        for folder in new_country_folders
+                    ]
             else:
-                raise ValueError(f"No data folder found for country {country_code}. "
-                                 f"Check if folder mapping is up to date.")
+                raise ValueError(
+                    f"No data folder found for country {country_code}. "
+                    f"Check if folder mapping is up to date."
+                )
     else:
         country_folders = [folder]

@@ -671,17 +760,17 @@ def get_crf_files(
                     file_filter["party"] = country
                     file_filter["party"] = country
                     dates = get_submission_dates(folder, file_filter)
                     dates = get_submission_dates(folder, file_filter)
                     file_filter["date"] = find_latest_date(dates)
                     file_filter["date"] = find_latest_date(dates)
-                    input_files = input_files + \
-                                  filter_filenames(input_folder.glob("*.xlsx"),
-                                                   **file_filter)
+                    input_files = input_files + filter_filenames(
+                        input_folder.glob("*.xlsx"), **file_filter
+                    )
             else:
                 file_filter = file_filter_template.copy()
                 if date is not None:
                     file_filter["date"] = date
-                input_files = input_files + \
-                              filter_filenames(input_folder.glob("*.xlsx"),
-                                               **file_filter)
-        #else:
+                input_files = input_files + filter_filenames(
+                    input_folder.glob("*.xlsx"), **file_filter
+                )
+        # else:
         #    raise ValueError(f"Folder {input_folder} does not exist")
     if len(input_files) == 0:
         raise ValueError(f"No input files found in {country_folders}")
@@ -699,7 +788,7 @@ def get_crf_files(


 def get_info_from_crf_filename(
-        filename: str,
+    filename: str,
 ) -> dict[str, Union[int, str]]:
     """
     Parse given file name and return a dict with information
@@ -707,7 +796,6 @@ def get_info_from_crf_filename(

     Parameters
     ----------
-
     filename: str
         The file to analyze (without path)

@@ -729,8 +817,7 @@ def get_info_from_crf_filename(
     try:
         file_info["data_year"] = int(name_parts[2])
     except:
-        print(f"Data year string {name_parts[2]} "
-              "could not be converted to int.")
+        print(f"Data year string {name_parts[2]} " "could not be converted to int.")
         file_info["data_year"] = name_parts[2]
         file_info["data_year"] = name_parts[2]
     file_info["date"] = name_parts[3]
     file_info["date"] = name_parts[3]
     # the last part (time code) is missing for Australia since 2023
     # the last part (time code) is missing for Australia since 2023
@@ -742,11 +829,11 @@ def get_info_from_crf_filename(


 def filter_filenames(
-        files_to_filter: list[Path],
-        party: Optional[Union[str, list[str]]] = None,
-        data_year: Optional[Union[int, list[int]]] = None,
-        submission_year: Optional[str] = None,
-        date: Optional[str] = None,
+    files_to_filter: list[Path],
+    party: Optional[Union[str, list[str]]] = None,
+    data_year: Optional[Union[int, list[int]]] = None,
+    submission_year: Optional[str] = None,
+    date: Optional[str] = None,
 ) -> list[Path]:
     """Filter a list of filenames of CRF files

@@ -792,8 +879,8 @@ def filter_filenames(


 def check_crf_file_info(
-        file_info: dict,
-        file_filter: dict,
+    file_info: dict,
+    file_filter: dict,
 ) -> bool:
     """
     Check if a CRF file has given properties
@@ -837,9 +924,9 @@ def check_crf_file_info(


 def create_category_tree(
-        specification: list[list],
-        table: str,
-        country: Optional[str] = None,
+    specification: list[list],
+    table: str,
+    country: Optional[str] = None,
 ) -> Tree:
     """
     Create a treelib Tree for the categorical hierarchy from a CRF
@@ -850,7 +937,6 @@ def create_category_tree(

     Parameters
     ----------
-
     specification: List[List]
         The `sector_mapping` dict of a table specification

@@ -866,8 +952,10 @@ def create_category_tree(
     """
     """
     # small sanity check on the specification
     # small sanity check on the specification
     if len(specification[0]) < 3:
     if len(specification[0]) < 3:
-        raise ValueError(f"Error: Specification for Table {table} has non-unique "
-                         "categories and need level specifications")
+        raise ValueError(
+            f"Error: Specification for Table {table} has non-unique "
+            "categories and need level specifications"
+        )

     # initialize variables for tree building
     parent_info = [
@@ -888,11 +976,11 @@ def create_category_tree(
     if country is not None:
         # remove country tags from categories and mark categories
         # for other countries for removal
-        specification = [filter_category(mapping, country)
-                         for mapping in specification]
+        specification = [filter_category(mapping, country) for mapping in specification]
         # remove the categories for other countries
-        specification = [mapping for mapping in specification
-                         if mapping[0] != "\\REMOVE"]
+        specification = [
+            mapping for mapping in specification if mapping[0] != "\\REMOVE"
+        ]

     # build a tree from specification
     # when looping over the categories present in the table
@@ -903,7 +991,9 @@ def create_category_tree(
         if current_cat_level == last_cat_info["level"]:
             # cat has the same level as the preceding one, so no change to
             # parent node
-            category_tree.create_node(current_cat, idx, parent=parent_info[-1]["id"], data=mapping)
+            category_tree.create_node(
+                current_cat, idx, parent=parent_info[-1]["id"], data=mapping
+            )
         elif current_cat_level == last_cat_info["level"] + 1:
             # the current category is one level further away from
             # the trunk of the tree. This means that
@@ -913,23 +1003,29 @@ def create_category_tree(
                 {
                     "id": last_cat_info["id"],
                     "tag": last_cat_info["category"],
-                    "level": last_cat_info["level"]
+                    "level": last_cat_info["level"],
                 }
             )
             # add the category as new node
-            category_tree.create_node(current_cat, idx, parent=parent_info[-1]["id"], data=mapping)
+            category_tree.create_node(
+                current_cat, idx, parent=parent_info[-1]["id"], data=mapping
+            )

         elif current_cat_level < last_cat_info["level"]:
             # the new level is smaller (closer to the trunk)
             # than the last one. Thus we remove all parents
             # from this level on
-            parent_info = parent_info[0: current_cat_level + 1]
-            category_tree.create_node(current_cat, idx, parent=parent_info[-1]["id"], data=mapping)
+            parent_info = parent_info[0 : current_cat_level + 1]
+            category_tree.create_node(
+                current_cat, idx, parent=parent_info[-1]["id"], data=mapping
+            )
         else:
             # increase in levels of more than one is not allowed
-            raise ValueError(f"Error in sector hierarchy for table {table}, category {current_cat}: "
-                             f"Category level is {current_cat_level} and parent level is "
-                             f"{parent_info[-1]['level']}")
+            raise ValueError(
+                f"Error in sector hierarchy for table {table}, category {current_cat}: "
+                f"Category level is {current_cat_level} and parent level is "
+                f"{parent_info[-1]['level']}"
+            )

         # set last_cat_info
         last_cat_info["category"] = current_cat
@@ -940,8 +1036,8 @@ def create_category_tree(


 def filter_category(
-        mapping: list,
-        country: str,
+    mapping: list,
+    country: str,
 ) -> list[str]:
     """
     This function checks if a category mapping is suitable for the given country.
@@ -975,9 +1071,9 @@ def filter_category(
             new_mapping[0] = "\\REMOVE"
         else:
             re_result = re.search(regex_exclude_full, mapping[0])
-            new_mapping[0] = mapping[0][len(re_result.group(1)) + 1:]
+            new_mapping[0] = mapping[0][len(re_result.group(1)) + 1 :]
     elif mapping[0].startswith(string_country):
-        new_mapping[0] = mapping[0][len(string_country) + 1:]
+        new_mapping[0] = mapping[0][len(string_country) + 1 :]
     elif re.match(regex_countries, mapping[0]):
         new_mapping[0] = "\\REMOVE"

@@ -985,9 +1081,9 @@ def filter_category(


 def get_latest_date_for_country(
-        country_code: str,
-        submission_year: int,
-)->str:
+    country_code: str,
+    submission_year: int,
+) -> str:
     """
     """
     Find the latest submission date for a country
     Find the latest submission date for a country
 
 
@@ -1013,26 +1109,36 @@ def get_latest_date_for_country(
         country_folders = folder_mapping[country_code]
         if isinstance(country_folders, str):
             # only one folder
-            submission_date = find_latest_date(get_submission_dates(
-                downloaded_data_path_UNFCCC / country_folders / f"CRF{submission_year}", file_filter))
+            submission_date = find_latest_date(
+                get_submission_dates(
+                    downloaded_data_path_UNFCCC
+                    / country_folders
+                    / f"CRF{submission_year}",
+                    file_filter,
+                )
+            )
         else:
             dates = []
             for folder in country_folders:
-                folder_submission = downloaded_data_path_UNFCCC / folder / f"CRF{submission_year}"
+                folder_submission = (
+                    downloaded_data_path_UNFCCC / folder / f"CRF{submission_year}"
+                )
                 if folder_submission.exists():
                     dates = dates + get_submission_dates(folder_submission, file_filter)
             submission_date = find_latest_date(dates)
     else:
-        raise ValueError(f"No data folder found for country {country_code}. "
-                         f"Check if folder mapping is up to date.")
+        raise ValueError(
+            f"No data folder found for country {country_code}. "
+            f"Check if folder mapping is up to date."
+        )

     return submission_date


 def get_submission_dates(
-        folder: Path,
-        file_filter: dict[str, Union[str, int, list]],
-)->list[str]:
+    folder: Path,
+    file_filter: dict[str, Union[str, int, list]],
+) -> list[str]:
     """
     """
     Returns all submission dates available in a folder
     Returns all submission dates available in a folder
 
 
@@ -1050,8 +1156,10 @@ def get_submission_dates(
             List of dates as str
     """
     if "date" in file_filter:
-        raise ValueError("'date' present in 'file_filter'. This makes no sense as "
-                         "the function's purpose is to return available dates.")
+        raise ValueError(
+            "'date' present in 'file_filter'. This makes no sense as "
+            "the function's purpose is to return available dates."
+        )

     if folder.exists():
         files = filter_filenames(folder.glob("*.xlsx"), **file_filter)
@@ -1065,9 +1173,9 @@ def get_submission_dates(


 def get_submission_parties(
-        folder: Path,
-        file_filter: dict[str, Union[str, int, list]],
-)->list[str]:
+    folder: Path,
+    file_filter: dict[str, Union[str, int, list]],
+) -> list[str]:
     """
     """
     Returns all submission dates available in a folder
     Returns all submission dates available in a folder
 
 
@@ -1085,8 +1193,10 @@ def get_submission_parties(
             List of parties as str
     """
     if "party" in file_filter:
-        raise ValueError("'party' present in 'file_filter'. This makes no sense as "
-                         "the function's purpose is to return available parties.")
+        raise ValueError(
+            "'party' present in 'file_filter'. This makes no sense as "
+            "the function's purpose is to return available parties."
+        )

     if folder.exists():
         files = filter_filenames(list(folder.glob("*.xlsx")), **file_filter)
@@ -1100,9 +1210,9 @@ def get_submission_parties(


 def find_latest_date(
-        dates: list[str],
-        date_format: str='%d%m%Y',
-)-> str:
+    dates: list[str],
+    date_format: str = "%d%m%Y",
+) -> str:
     """
     """
     Returns the latest date in a list of dates as str in the format
     Returns the latest date in a list of dates as str in the format
     ddmmyyyy
     ddmmyyyy
@@ -1117,11 +1227,11 @@ def find_latest_date(
         str: latest date
     """
     if len(dates) > 0:
-        dates_datetime = [[date, datetime.strptime(date, date_format)] for date in
-                          dates]
+        dates_datetime = [
+            [date, datetime.strptime(date, date_format)] for date in dates
+        ]
         dates_datetime = sorted(dates_datetime, key=itemgetter(1))
     else:
         raise ValueError("Passed list of dates is empty")

     return dates_datetime[-1][0]
-
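
The date handling in find_latest_date is pure standard library; a self-contained sketch of the sort-and-pick-latest idea with the ddmmyyyy format used here (dates are made up):

    from datetime import datetime
    from operator import itemgetter

    dates = ["10042023", "01122022", "15052023"]
    # pair each string with its parsed datetime, sort on the datetime
    dates_datetime = [[date, datetime.strptime(date, "%d%m%Y")] for date in dates]
    dates_datetime = sorted(dates_datetime, key=itemgetter(1))
    print(dates_datetime[-1][0])  # 15052023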

+ 8 - 7
src/unfccc_ghg_data/unfccc_di_reader/read_unfccc_di_for_country.py

@@ -6,12 +6,13 @@ function such that it can be called from datalad
 import argparse

 from unfccc_ghg_data.unfccc_di_reader.unfccc_di_reader_core import (
-    read_UNFCCC_DI_for_country)
+    read_UNFCCC_DI_for_country,
+)

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--country', help='Country code')
-    parser.add_argument('--date', help='String with current date')
+    parser.add_argument("--country", help="Country code")
+    parser.add_argument("--date", help="String with current date")
     args = parser.parse_args()

     country_code = args.country
@@ -19,10 +20,10 @@ if __name__ == "__main__":

     read_UNFCCC_DI_for_country(
         country_code=country_code,
-        category_groups=None, # read all categories
-        read_subsectors=False, # not applicable as we read all categories
+        category_groups=None,  # read all categories
+        read_subsectors=False,  # not applicable as we read all categories
         date_str=date_str,
         date_str=date_str,
-        pm2if_specifications=None, # automatically use the right specs for AI and NAI
-        default_gwp=None, # automatically uses right default GWP for AI and NAI
+        pm2if_specifications=None,  # automatically use the right specs for AI and NAI
+        default_gwp=None,  # automatically uses right default GWP for AI and NAI
         debug=False,
     )
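
For illustration, the argument handling can be exercised without a shell by passing an explicit argv list (values are hypothetical):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--country", help="Country code")
    parser.add_argument("--date", help="String with current date")
    args = parser.parse_args(["--country", "ARG", "--date", "2023-01-15"])
    print(args.country, args.date)  # ARG 2023-01-15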

+ 5 - 8
src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_bur.py

@@ -5,7 +5,7 @@ Based on `process_bur` from national-inventory-submissions
 (https://github.com/openclimatedata/national-inventory-submisions)
 """

-#import requests
+# import requests
 import re
 import time
 from pathlib import Path
@@ -15,25 +15,24 @@ import pandas as pd
 from bs4 import BeautifulSoup
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
-from unfccc_ghg_data.unfccc_downloader import get_unfccc_submission_info

 from unfccc_ghg_data.helper import downloaded_data_path_UNFCCC
-
+from unfccc_ghg_data.unfccc_downloader import get_unfccc_submission_info

 if __name__ == "__main__":
     print("Fetching BUR submissions ...")

     url = "https://unfccc.int/BURs"

-    #print(url)
+    # print(url)

     # set options for headless mode
     profile_path = ".firefox"
     options = Options()
-    options.add_argument('-headless')
+    options.add_argument("-headless")

     # create profile for headless mode and automatic downloading
-    options.set_preference('profile', profile_path)
+    options.set_preference("profile", profile_path)

     # set up selenium driver
     driver = Firefox(options=options)
@@ -64,7 +63,6 @@ if __name__ == "__main__":
             if str(Path(href).parent).endswith("documents"):
                 targets.append({"title": title, "url": href})

-
     pattern = re.compile(r"BUR ?\d")
     pattern = re.compile(r"BUR ?\d")
 
 
     # Go through sub-pages.
     # Go through sub-pages.
@@ -79,7 +77,6 @@ if __name__ == "__main__":
         else:
             no_downloads.append({target["title"], url})

-
     if len(no_downloads) > 0:
         print("No downloads for ", no_downloads)
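
The headless browser setup used in this script, as a self-contained sketch (assumes Firefox and geckodriver are installed; the URL is the one from the script):

    from bs4 import BeautifulSoup
    from selenium.webdriver import Firefox
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.add_argument("-headless")
    driver = Firefox(options=options)
    driver.get("https://unfccc.int/BURs")
    # parse the rendered page with BeautifulSoup
    html = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()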

+ 3 - 7
src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_nc.py

@@ -19,21 +19,20 @@ from selenium.webdriver.firefox.options import Options
 from unfccc_ghg_data.helper import downloaded_data_path_UNFCCC
 from unfccc_ghg_data.unfccc_downloader import get_unfccc_submission_info

-
 if __name__ == "__main__":
     print("Fetching NC submissions ...")

     url = "https://unfccc.int/non-annex-I-NCs"

-    #print(url)
+    # print(url)

     # set options for headless mode
     profile_path = ".firefox"
     options = Options()
-    options.add_argument('-headless')
+    options.add_argument("-headless")

     # create profile for headless mode and automatic downloading
-    options.set_preference('profile', profile_path)
+    options.set_preference("profile", profile_path)

     # set up selenium driver
     driver = Firefox(options=options)
@@ -64,10 +63,8 @@ if __name__ == "__main__":
             if str(Path(href).parent).endswith("documents"):
                 targets.append({"title": title, "url": href})

-
     pattern = re.compile(r"NC ?\d")
     pattern = re.compile(r"NC ?\d")
 
 
-
     # Go through sub-pages.
     for target in targets:
         time.sleep(randrange(5, 15))
@@ -80,7 +77,6 @@ if __name__ == "__main__":
         else:
             no_downloads.append({target["title"], url})

-
     if len(no_downloads) > 0:
     if len(no_downloads) > 0:



+ 27 - 4
src/unfccc_ghg_data/unfccc_reader/Argentina/__init__.py

@@ -1,7 +1,30 @@
-"""Argentina (BUR4)
+"""Read Argentina's BURs, NIRs, NCs

 Scripts and configurations to read Argentina's submissions to the UNFCCC.
-Currently code for the following submissions is available:
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
-* BUR4 (from pdf)
-"""
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'ARG'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information by running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=ARG
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 115 - 96
src/unfccc_ghg_data/unfccc_reader/Argentina/read_ARG_BUR4_from_pdf.py

@@ -2,16 +2,22 @@
 Read Argentina's BUR4 from pdf

 This script reads data from Argentina's fourth Biennial Update Report (BUR4).
- Data is read from the pdf file using camelot"""
+Data is read from the pdf file using camelot
+"""
+import os
 import sys

 import camelot
 import primap2 as pm2
 from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2

-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
-from unfccc_ghg_data.helper import gas_baskets, process_data_for_country
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    gas_baskets,
+    process_data_for_country,
+)

 # ###
 # configuration
@@ -21,53 +27,49 @@ from unfccc_ghg_data.helper import gas_baskets, process_data_for_country
 #  PRIMAP2 version
 if __name__ == "__main__":
     # folders and files
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Argentina' / \
-                   'BUR4'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Argentina'
+    input_folder = downloaded_data_path / "UNFCCC" / "Argentina" / "BUR4"
+    output_folder = extracted_data_path / "UNFCCC" / "Argentina"
     if not output_folder.exists():
         output_folder.mkdir()

-    output_filename = 'ARG_BUR4_2022_'
+    output_filename = "ARG_BUR4_2022_"
+
+    pdf_file = "4to_Informe_Bienal_de_la_Rep%C3%BAblica_Argentina.pdf"
-    pdf_file = '4to_Informe_Bienal_de_la_Rep%C3%BAblica_Argentina.pdf'
+    # definitions part 1: reading data from pdf and preprocessing for conversion to
+    # PRIMAP2 format
-    # definitions part 1: reading data from pdf and preprocessing for conversion to PRIMAP2 format
     # part 1.1 KyotoGHG, CO2, CH4, N2O tables
     #
     pages_to_read = range(232, 244)
     data_start_keyword = "Id#"
     data_end_keyword = "Fuente: Elaboración propia"
-    index_cols = ['Id#', 'Nombre']
-    col_rename = {
-        index_cols[0]: "category",
-        index_cols[1]: "orig_cat_name"
-    }
-    metadata = {
-        "entity": [0, 1],
-        "unit": [0, 2]
-    }
+    index_cols = ["Id#", "Nombre"]
+    col_rename = {index_cols[0]: "category", index_cols[1]: "orig_cat_name"}
+    metadata = {"entity": [0, 1], "unit": [0, 2]}

     rows_to_drop = [0]

     metadata_mapping = {
-        'unit': {
-            '(GgCO2e)': 'GgCO2e',
-            '(GgCO2)': 'Gg',
-            '(GgN2O)': 'Gg',
-            '(GgCH4)': 'Gg',
-            '(GgGas)': 'Gg',
+        "unit": {
+            "(GgCO2e)": "GgCO2e",
+            "(GgCO2)": "Gg",
+            "(GgN2O)": "Gg",
+            "(GgCH4)": "Gg",
+            "(GgGas)": "Gg",
         }
         }
     }

     # part 1.2: fgases table
+    # the f-gases table is in wide format with no sectoral resolution and gases as row
+    # header
     pages_to_read_fgases = range(244, 247)
     pages_to_read_fgases = range(244, 247)
     data_start_keyword_fgases = "Gas"
+    index_cols_fgases = ["Gas"]
     cols_to_drop_fgases = ["Nombre"]
     cols_to_drop_fgases = ["Nombre"]
     metadata_fgases = {
         "unit": [0, 2],
+        "category": "2",
         "orig_cat_name": "PROCESOS INDUSTRIALES Y USO DE PRODUCTOS",
         "orig_cat_name": "PROCESOS INDUSTRIALES Y USO DE PRODUCTOS",
     }
     }
     col_rename_fgases = {
     col_rename_fgases = {
@@ -79,14 +81,14 @@ if __name__ == "__main__":
     cats_remove = ["Information Items", "Memo Items (3)"]
     cats_remove = ["Information Items", "Memo Items (3)"]
     # manual category codes
     cat_codes_manual = {  # conversion to PRIMAP1 format
-        '1A3di': 'MBKM',
-        '1A3ai': 'MBKA',
-        '1A3di Navegación marítima y fluvial internacional': 'MBKM',
-        'S/N': 'MMULTIOP',
+        "1A6": "MBIO",
+        "1A3di": "MBKM",
+        "1A3ai": "MBKA",
+        "1A3di Navegación marítima y fluvial internacional": "MBKM",
+        "S/N": "MMULTIOP",
     }
     }
 
 
-    cat_code_regexp = r'(?P<code>^[A-Z0-9]{1,8}).*'
+    cat_code_regexp = r"(?P<code>^[A-Z0-9]{1,8}).*"

     time_format = "%Y"

@@ -116,32 +118,32 @@ if __name__ == "__main__":
     coords_value_mapping = {
         #    "category": "PRIMAP1",
         "entity": {
-            'HFC-23': 'HFC23',
-            'HFC-32': 'HFC32',
-            'HFC-41': 'HFC41',
-            'HFC-43-10mee': 'HFC4310mee',
-            'HFC-125': 'HFC125',
-            'HFC-134': 'HFC134',
-            'HFC-134a': 'HFC134a',
-            'HFC-152a': 'HFC152a',
-            'HFC-143': 'HFC143',
-            'HFC-143a': 'HFC143a',
-            'HFC-227ea': 'HFC227ea',
-            'HFC-236fa': 'HFC236fa',
-            'HFC-245ca': 'HFC245ca',
-            'HFC-365mfc': 'HFC365mfc',
-            'HFC-245fa': 'HFC245fa',
-            'PFC-143 (CF4)': 'CF4',
-            'PFC-116 (C2F6)': 'C2F6',
-            'PFC-218 (C3F8)': 'C3F8',
-            'PFC-31-10 (C4F10)': 'C4F10',
-            'c-C4F8': 'cC4F8',
-            'PFC-51-144 (C6F14)': 'C6F14',
+            "HFC-23": "HFC23",
+            "HFC-32": "HFC32",
+            "HFC-41": "HFC41",
+            "HFC-43-10mee": "HFC4310mee",
+            "HFC-125": "HFC125",
+            "HFC-134": "HFC134",
+            "HFC-134a": "HFC134a",
+            "HFC-152a": "HFC152a",
+            "HFC-143": "HFC143",
+            "HFC-143a": "HFC143a",
+            "HFC-227ea": "HFC227ea",
+            "HFC-236fa": "HFC236fa",
+            "HFC-245ca": "HFC245ca",
+            "HFC-365mfc": "HFC365mfc",
+            "HFC-245fa": "HFC245fa",
+            "PFC-143 (CF4)": "CF4",
+            "PFC-116 (C2F6)": "C2F6",
+            "PFC-218 (C3F8)": "C3F8",
+            "PFC-31-10 (C4F10)": "C4F10",
+            "c-C4F8": "cC4F8",
+            "PFC-51-144 (C6F14)": "C6F14",
         },
         },
         "unit": "PRIMAP1",
         "orig_cat_name": {
             "1A3di Navegación marítima y fluvial internacional": "Navegación marítima y fluvial internacional",
+        },
     }
     }

     coords_value_filling = {
         "references": "https://unfccc.int/documents/419772",
         "references": "https://unfccc.int/documents/419772",
         "rights": "XXXX",
         "rights": "XXXX",
         "contact": "mail@johannes-guetschow.de",
         "contact": "mail@johannes-guetschow.de",
-        "title": "Cuarto Informe Bienal de Actualización de la República Argentina a la Convención Marco delas Naciones Unidas Sobre el Cambio Climático",
+        "title": "Cuarto Informe Bienal de Actualización de la República Argentina a "
+        "la Convención Marco delas Naciones Unidas Sobre el Cambio Climático",
         "comment": "Read fom pdf file by Johannes Gütschow",
         "comment": "Read fom pdf file by Johannes Gütschow",
         "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
         "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
     }
     }
@@ -192,8 +195,9 @@ if __name__ == "__main__":
     data_all = None
     data_all = None
     for page in pages_to_read:
         # read current page
-                                  flavor='stream')
+        tables = camelot.read_pdf(
+            str(input_folder / pdf_file), pages=str(page), flavor="stream"
+        )
         df_current = tables[0].df
         rows_to_drop = []
         for index, data in df_current.iterrows():
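
camelot's stream flavor is used for all tables in this script; a minimal sketch of the call (file name hypothetical; pages takes a string):

    import camelot

    tables = camelot.read_pdf("report.pdf", pages="232", flavor="stream")
    df = tables[0].df  # first detected table as a string-valued pandas DataFrame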
@@ -212,16 +216,18 @@ if __name__ == "__main__":
         df_current = df_current.drop(rows_to_drop)
         idx_header = df_current.index[df_current[0] == index_cols[0]].tolist()
         df_current = df_current.rename(
-            dict(zip(df_current.columns, list(df_current.loc[idx_header[0]]))), axis=1)
+            dict(zip(df_current.columns, list(df_current.loc[idx_header[0]]))), axis=1
+        )
         df_current = df_current.drop(idx_header)

         # for sheet "Aggregate GHGs" fill entity cell
         if page in range(232, 235):
             df_current.iloc[
-                metadata["entity"][0], metadata["entity"][1]] = "KYOTOGHG (SARGWP100)"
+                metadata["entity"][0], metadata["entity"][1]
+            ] = "KYOTOGHG (SARGWP100)"
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # add columns
@@ -242,21 +248,27 @@ if __name__ == "__main__":
         df_current = df_current.drop(df_current.index[0])

         # fix number format
-        df_current = df_current.apply(lambda x: x.str.replace('.', '', regex=False), axis=1)
-        df_current = df_current.apply(lambda x: x.str.replace(',', '.', regex=False),
-                                      axis=1)
+        df_current = df_current.apply(
+            lambda x: x.str.replace(".", "", regex=False), axis=1
+        )
+        df_current = df_current.apply(
+            lambda x: x.str.replace(",", ".", regex=False), axis=1
+        )
-        df_current.rename(columns=col_rename, inplace=True)
+        df_current = df_current.rename(columns=col_rename)

         # reindex
-        df_current.reset_index(inplace=True, drop=True)
+        df_current = df_current.reset_index(drop=True)

         df_current["category"] = df_current["category"].replace(cat_codes_manual)
+
         # then the regex replacements
-        def repl(m):
-            return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-        df_current["category"] = df_current["category"].str.replace(cat_code_regexp, repl,
-                                                                    regex=True)
+        def repl(m):  # noqa: D103
+            return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+        df_current["category"] = df_current["category"].str.replace(
+            cat_code_regexp, repl, regex=True
+        )

         df_current = df_current.reset_index(drop=True)
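
The two replacements above convert Spanish-style numbers (dot as thousands separator, comma as decimal separator) into parseable strings; a worked toy example:

    import pandas as pd

    df = pd.DataFrame({"2016": ["1.234,5", "0,8"]})
    df = df.apply(lambda x: x.str.replace(".", "", regex=False), axis=1)
    df = df.apply(lambda x: x.str.replace(",", ".", regex=False), axis=1)
    print(df["2016"].tolist())  # ['1234.5', '0.8']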
@@ -274,7 +286,7 @@ if __name__ == "__main__":
             coords_value_filling=coords_value_filling,
             filter_remove=filter_remove,
             filter_keep=filter_keep,
-            meta_data=meta_data
+            meta_data=meta_data,
         )

         # convert to PRIMAP2 native format
@@ -289,8 +301,9 @@ if __name__ == "__main__":
     # read fgases
     for page in pages_to_read_fgases:
         # read current page
-        tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page),
-                                  flavor='stream')
+        tables = camelot.read_pdf(
+            str(input_folder / pdf_file), pages=str(page), flavor="stream"
+        )
         df_current = tables[0].df
         rows_to_drop = []
         for index, data in df_current.iterrows():
@@ -309,11 +322,12 @@ if __name__ == "__main__":
         df_current = df_current.drop(rows_to_drop)
         idx_header = df_current.index[df_current[0] == index_cols_fgases[0]].tolist()
         df_current = df_current.rename(
-            dict(zip(df_current.columns, list(df_current.loc[idx_header[0]]))), axis=1)
+            dict(zip(df_current.columns, list(df_current.loc[idx_header[0]]))), axis=1
+        )
         df_current = df_current.drop(idx_header)

         # drop all rows where the index cols (category code
-        df_current.dropna(axis=0, how='all', subset=index_cols_fgases, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols_fgases)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # add columns
@@ -324,7 +338,8 @@ if __name__ == "__main__":
                 value = metadata_fgases[col]
             else:
                 value = df_current.iloc[
-                    metadata_fgases[col][0], metadata_fgases[col][1] + inserted]
+                    metadata_fgases[col][0], metadata_fgases[col][1] + inserted
+                ]
                 if col in metadata_mapping.keys():
                     if value in metadata_mapping[col].keys():
                         value = metadata_mapping[col][value]
@@ -339,21 +354,27 @@ if __name__ == "__main__":
         df_current = df_current.drop(df_current.index[0])

         # fix number format
-        df_current = df_current.apply(lambda x: x.str.replace('.', '', regex=False), axis=1)
-        df_current = df_current.apply(lambda x: x.str.replace(',', '.', regex=False),
-                                      axis=1)
+        df_current = df_current.apply(
+            lambda x: x.str.replace(".", "", regex=False), axis=1
+        )
+        df_current = df_current.apply(
+            lambda x: x.str.replace(",", ".", regex=False), axis=1
+        )
-        df_current.rename(columns=col_rename_fgases, inplace=True)
+        df_current = df_current.rename(columns=col_rename_fgases)

         # reindex
-        df_current.reset_index(inplace=True, drop=True)
+        df_current = df_current.reset_index(drop=True)

         df_current["category"] = df_current["category"].replace(cat_codes_manual)
-        # then the regex repalcements
-        def repl(m):
-            return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-        df_current["category"] = df_current["category"].str.replace(cat_code_regexp, repl,
-                                                                    regex=True)
+
+        # then the regex replacements
+        def repl(m):  # noqa: D103
+            return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+        df_current["category"] = df_current["category"].str.replace(
+            cat_code_regexp, repl, regex=True
+        )

         df_current = df_current.reset_index(drop=True)
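
The regex pulls the leading category code out of strings like "1A2 ..." and hands it to the primap2 converter; a self-contained sketch on toy strings (the exact output format comes from primap2):

    import pandas as pd
    from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2

    cat_code_regexp = r"(?P<code>^[A-Z0-9]{1,8}).*"

    def repl(m):
        return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))

    cats = pd.Series(["1A2 Industrias manufactureras", "2F Uso de productos"])
    print(cats.str.replace(cat_code_regexp, repl, regex=True).tolist())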
@@ -371,7 +392,7 @@ if __name__ == "__main__":
             coords_value_filling=coords_value_filling,
             filter_remove=filter_remove,
             filter_keep=filter_keep,
-            meta_data=meta_data
+            meta_data=meta_data,
         )

         # convert to PRIMAP2 native format
@@ -390,19 +411,17 @@ if __name__ == "__main__":
         processing_info_country=None,
     )

-
     # ###
     # save data to IF and native format
     # ###

     encoding = {var: compression for var in data_all.data_vars}
-    data_all.pr.to_netcdf(output_folder / (output_filename + coords_terminologies[
-        "category"] + ".nc"), encoding=encoding)
+    data_all.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )

     data_if = data_all.pr.to_interchange_format()
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
-
-
-
-
-
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )

+ 28 - 5
src/unfccc_ghg_data/unfccc_reader/Chile/__init__.py

@@ -1,7 +1,30 @@
-"""Chile (BUR4, BUR5)
+"""Read Chile's BURs, NIRs, NCs
-Scripts and configurations to read Chile's is available:
- * BUR4 (from xlsx)
- * BUR5 (from xlsx)
+Scripts and configurations to read Chile's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
-"""
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'CHL'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information by running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=CHL
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 288 - 141
src/unfccc_ghg_data/unfccc_reader/Chile/config_chl_bur4.py

@@ -1,166 +1,295 @@
+"""Config for Chile BUR4,5
+
+General configuration for reading the inventory files underlying Chile's BURs 4 and 5.
+PRIMAP2 data format specific configuration is BUR specific and not contained here
+but in the reading scripts.
+
+"""
+
 ## parameters for conversion to IPCC2006 categories
 filter_remove_IPCC2006 = {
-    "filter_cats": { # filter cats that have no 1:1 match for IPCC2006 or are additional subsectors
+    "filter_cats": {  # filter cats that have no 1:1 match for IPCC2006 or are additional subsectors
         "category (IPCC2006_PRIMAP)": [
         "category (IPCC2006_PRIMAP)": [
             # refrigeration and air conditioning subsectors don't match IPCC2006
             # refrigeration and air conditioning subsectors don't match IPCC2006
-            '2.F.1.a', '2.F.1.b', '2.F.1.c', '2.F.1.d', '2.F.1.e', '2.F.1.f',
+            "2.F.1.a",
+            "2.F.1.b",
+            "2.F.1.c",
+            "2.F.1.d",
+            "2.F.1.e",
+            "2.F.1.f",
             # additional subsectors for other cattle in enteric fermentation
-            '3.A.1.b.i', '3.A.1.b.ii', '3.A.1.b.iii', '3.A.1.b.iv', '3.A.1.b.v',
+            "3.A.1.b.i",
+            "3.A.1.b.ii",
+            "3.A.1.b.iii",
+            "3.A.1.b.iv",
+            "3.A.1.b.v",
             # additional subcategories for swine in enteric fermentation
-            '3.A.3.a', '3.A.3.b', '3.A.3.c',
+            "3.A.3.a",
+            "3.A.3.b",
+            "3.A.3.c",
             # other animals in enteric fermentation not fitting the IPCC2006 other animals
-            '3.A.4',
+            "3.A.4",
             # need to be summed to '3.A.4.j'
-            '3.A.4.f', '3.A.4.g', '3.A.4.g.i', '3.A.4.g.ii',
+            "3.A.4.f",
+            "3.A.4.g",
+            "3.A.4.g.i",
+            "3.A.4.g.ii",
             # additional subsectors for other cattle in enteric fermentation
-            '3.B.1.b.i', '3.B.1.b.ii', '3.B.1.b.iii', '3.B.1.b.iv', '3.B.1.b.v',
+            "3.B.1.b.i",
+            "3.B.1.b.ii",
+            "3.B.1.b.iii",
+            "3.B.1.b.iv",
+            "3.B.1.b.v",
             # additional subcategories for swine in enteric fermentation
-            '3.B.3.a', '3.B.3.b', '3.B.3.c',
+            "3.B.3.a",
+            "3.B.3.b",
+            "3.B.3.c",
             # other animals in enteric fermentation not fitting the IPCC2006 other animals
-            '3.B.4',
+            "3.B.4",
             # need to be summed to '3.A.4.j'
-            '3.B.4.f', '3.B.4.g', '3.B.4.g.i', '3.B.4.g.ii',
+            "3.B.4.f",
+            "3.B.4.g",
+            "3.B.4.g.i",
+            "3.B.4.g.ii",
             # subsectors of indirect N2O from manure management
-            '3.B.5.a', '3.B.5.b', '3.B.5.c', '3.B.5.d', '3.B.5.d.i', '3.B.5.d.ii',
-            '3.B.5.d.iii', '3.B.5.d.iv', '3.B.5.d.v', '3.B.5.d.vi', '3.B.5.d.vii',
+            "3.B.5.a",
+            "3.B.5.b",
+            "3.B.5.c",
+            "3.B.5.d",
+            "3.B.5.d.i",
+            "3.B.5.d.ii",
+            "3.B.5.d.iii",
+            "3.B.5.d.iv",
+            "3.B.5.d.v",
+            "3.B.5.d.vi",
+            "3.B.5.d.vii",
             # subsectors of rice cultivation
-            '3.C.1', '3.C.2', '3.C.3', '3.C.4',
+            "3.C.1",
+            "3.C.2",
+            "3.C.3",
+            "3.C.4",
             # no direct representation of "agricultural soils" in IPCC 2006
-            '3.D',
+            "3.D",
             # subsectors of 3.D.1. not matching subsectors of 3.C.4 (direct emissions from managed soils)
             # '3.D.1.a.': '3.C.1.a', '3.D.1.b.': '3.C.1.b', '3.D.1.c.': '3.A.4.c', '3.D.1.d.': '3.C.4.d',
-            '3.D.1.a', '3.D.1.b', '3.D.1.b.i', '3.D.1.b.ii', '3.D.1.b.iii', '3.D.1.c',
-            '3.D.1.d', '3.D.1.e', '3.D.1.f', '3.D.1.g',
+            "3.D.1.a",
+            "3.D.1.b",
+            "3.D.1.b.i",
+            "3.D.1.b.ii",
+            "3.D.1.b.iii",
+            "3.D.1.c",
+            "3.D.1.d",
+            "3.D.1.e",
+            "3.D.1.f",
+            "3.D.1.g",
             # additional subsector level of 3.D.2.a (3.C.5.a Atmospheric deposition)
-            '3.D.2.a.i', '3.D.2.a.ii', '3.D.2.a.ii.1', '3.D.2.a.ii.2', '3.D.2.a.ii.3', '3.D.2.a.iii',
+            "3.D.2.a.i",
+            "3.D.2.a.ii",
+            "3.D.2.a.ii.1",
+            "3.D.2.a.ii.2",
+            "3.D.2.a.ii.3",
+            "3.D.2.a.iii",
             # additional subsector level of 3.D.2.b (3.C.5.b Nitrogen leaching and runoff)
-            '3.D.2.b.i', '3.D.2.b.ii', '3.D.2.b.ii.1', '3.D.2.b.ii.2', '3.D.2.b.ii.3', '3.D.2.b.iii',
-            '3.D.2.b.iv', '3.D.2.b.v',
+            "3.D.2.b.i",
+            "3.D.2.b.ii",
+            "3.D.2.b.ii.1",
+            "3.D.2.b.ii.2",
+            "3.D.2.b.ii.3",
+            "3.D.2.b.iii",
+            "3.D.2.b.iv",
+            "3.D.2.b.v",
             # additional subsector level of 3.F (3.C.1.b Biomass burning in cropland)
-            '3.F.1', '3.F.2', '3.F.3',
+            "3.F.1",
+            "3.F.2",
+            "3.F.3",
             # additional subsector level of 3.G (3.C.2 Liming)
-            '3.G.1', '3.G.2',
+            "3.G.1",
+            "3.G.2",
             # additional subsector levels of 4.A.1 (3.A.1.a Forest land remaining forest land)
-            '4.A.1.a', '4.A.1.a.i', '4.A.1.a.i.1', '4.A.1.a.i.1.a', '4.A.1.a.i.1.b', '4.A.1.a.i.1.c',
-            '4.A.1.a.i.1.d', '4.A.1.a.i.1.e', '4.A.1.a.i.1.f', '4.A.1.a.i.1.g', '4.A.1.a.i.1.h',
-            '4.A.1.a.i.1.i', '4.A.1.a.i.1.j', '4.A.1.a.i.1.k', '4.A.1.a.i.1.l', '4.A.1.a.i.2',
-            '4.A.1.a.i.2.a', '4.A.1.a.i.2.b', '4.A.1.a.i.2.c', '4.A.1.a.i.2.d', '4.A.1.a.i.2.e',
-            '4.A.1.a.i.2.f', '4.A.1.a.i.2.g', '4.A.1.a.i.2.h', '4.A.1.a.i.2.i', '4.A.1.a.i.2.j',
-            '4.A.1.a.i.2.k', '4.A.1.a.i.2.l', '4.A.1.a.i.3', '4.A.1.a.i.3.a', '4.A.1.a.i.3.b',
-            '4.A.1.a.i.3.c', '4.A.1.a.i.3.d', '4.A.1.a.i.3.e', '4.A.1.a.i.3.f', '4.A.1.a.i.3.g',
-            '4.A.1.a.i.3.h', '4.A.1.a.i.3.i', '4.A.1.a.i.3.j', '4.A.1.a.i.3.k', '4.A.1.a.i.3.l',
-            '4.A.1.a.ii', '4.A.1.a.ii.1', '4.A.1.a.ii.2', '4.A.1.a.ii.3', '4.A.1.a.ii.4',
-            '4.A.1.a.ii.5', '4.A.1.a.ii.6', '4.A.1.a.ii.7', '4.A.1.b', '4.A.1.b.i', '4.A.1.b.i.1',
-            '4.A.1.b.i.2', '4.A.1.b.i.3', '4.A.1.b.i.4', '4.A.1.b.ii', '4.A.1.b.ii.1', '4.A.1.b.ii.2',
-            '4.A.1.b.iii', '4.A.1.b.iii.1', '4.A.1.b.iii.1.a', '4.A.1.b.iii.1.b', '4.A.1.b.iii.2',
-            '4.A.1.b.iv', '4.A.1.c', '4.A.1.c.i', '4.A.1.c.ii',
+            "4.A.1.a",
+            "4.A.1.a.i",
+            "4.A.1.a.i.1",
+            "4.A.1.a.i.1.a",
+            "4.A.1.a.i.1.b",
+            "4.A.1.a.i.1.c",
+            "4.A.1.a.i.1.d",
+            "4.A.1.a.i.1.e",
+            "4.A.1.a.i.1.f",
+            "4.A.1.a.i.1.g",
+            "4.A.1.a.i.1.h",
+            "4.A.1.a.i.1.i",
+            "4.A.1.a.i.1.j",
+            "4.A.1.a.i.1.k",
+            "4.A.1.a.i.1.l",
+            "4.A.1.a.i.2",
+            "4.A.1.a.i.2.a",
+            "4.A.1.a.i.2.b",
+            "4.A.1.a.i.2.c",
+            "4.A.1.a.i.2.d",
+            "4.A.1.a.i.2.e",
+            "4.A.1.a.i.2.f",
+            "4.A.1.a.i.2.g",
+            "4.A.1.a.i.2.h",
+            "4.A.1.a.i.2.i",
+            "4.A.1.a.i.2.j",
+            "4.A.1.a.i.2.k",
+            "4.A.1.a.i.2.l",
+            "4.A.1.a.i.3",
+            "4.A.1.a.i.3.a",
+            "4.A.1.a.i.3.b",
+            "4.A.1.a.i.3.c",
+            "4.A.1.a.i.3.d",
+            "4.A.1.a.i.3.e",
+            "4.A.1.a.i.3.f",
+            "4.A.1.a.i.3.g",
+            "4.A.1.a.i.3.h",
+            "4.A.1.a.i.3.i",
+            "4.A.1.a.i.3.j",
+            "4.A.1.a.i.3.k",
+            "4.A.1.a.i.3.l",
+            "4.A.1.a.ii",
+            "4.A.1.a.ii.1",
+            "4.A.1.a.ii.2",
+            "4.A.1.a.ii.3",
+            "4.A.1.a.ii.4",
+            "4.A.1.a.ii.5",
+            "4.A.1.a.ii.6",
+            "4.A.1.a.ii.7",
+            "4.A.1.b",
+            "4.A.1.b.i",
+            "4.A.1.b.i.1",
+            "4.A.1.b.i.2",
+            "4.A.1.b.i.3",
+            "4.A.1.b.i.4",
+            "4.A.1.b.ii",
+            "4.A.1.b.ii.1",
+            "4.A.1.b.ii.2",
+            "4.A.1.b.iii",
+            "4.A.1.b.iii.1",
+            "4.A.1.b.iii.1.a",
+            "4.A.1.b.iii.1.b",
+            "4.A.1.b.iii.2",
+            "4.A.1.b.iv",
+            "4.A.1.c",
+            "4.A.1.c.i",
+            "4.A.1.c.ii",
             # additional subsector level in land converted to forest land
-            '4.A.2.a.i', '4.A.2.a.ii', '4.A.2.b.i', '4.A.2.b.ii', '4.A.2.c.i', '4.A.2.c.ii',
-            '4.A.2.d.i', '4.A.2.d.ii', '4.A.2.e.i', '4.A.2.e.ii',
+            "4.A.2.a.i",
+            "4.A.2.a.ii",
+            "4.A.2.b.i",
+            "4.A.2.b.ii",
+            "4.A.2.c.i",
+            "4.A.2.c.ii",
+            "4.A.2.d.i",
+            "4.A.2.d.ii",
+            "4.A.2.e.i",
+            "4.A.2.e.ii",
             # subsectors of solid waste disposal might not match
-            '5.A.1', '5.A.2', '5.A.3',
+            "5.A.1",
+            "5.A.2",
+            "5.A.3",
         ],
     },
 }
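The list above feeds `filter_data` in the Chile reader scripts below (`filter_data(data_if_2006, filter_remove=filter_remove_IPCC2006)`): these subsectors have no clean counterpart in the IPCC 2006 terminology, so they are dropped before the category mapping. A rough hand-rolled sketch of that step, assuming each filter entry maps column names to lists of values to drop (the real logic lives in primap2's `filter_data`):

    # sketch only, not primap2's implementation
    for spec in filter_remove_IPCC2006.values():
        for col, values in spec.items():
            data_if_2006 = data_if_2006[~data_if_2006[col].isin(values)]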
 
 
-cat_mapping = { # categories not listed here have the same UNFCCC_GHG_data as in IPCC 2006 specifications
-    '3': 'M.AG',
-    '3.A': '3.A.1',
-    '3.A.1': '3.A.1.a',
-    '3.A.1.a': '3.A.1.a.i',
-    '3.A.1.b': '3.A.1.a.ii',
-    '3.A.2': '3.A.1.c',
-    '3.A.3': '3.A.1.h',
-    '3.A.4.a': '3.A.1.b',
-    '3.A.4.b': '3.A.1.d',
-    '3.A.4.c': '3.A.1.f',
-    '3.A.4.d': '3.A.1.g',
-    '3.A.4.e': '3.A.1.i',
-    '3.B': '3.A.2',
-    '3.B.1': '3.A.2.a',
-    '3.B.1.a': '3.A.2.a.i',
-    '3.B.1.b': '3.A.2.a.ii',
-    '3.B.2': '3.A.2.c',
-    '3.B.3': '3.A.2.h',
-    '3.B.4.a': '3.A.2.b',
-    '3.B.4.b': '3.A.2.d',
-    '3.B.4.c': '3.A.2.f',
-    '3.B.4.d': '3.A.2.g',
-    '3.B.4.e': '3.A.2.i',
-    '3.B.5': '3.C.6',
-    '3.C': '3.C.7',
-    '3.D.1': '3.C.4',
-    '3.D.2': '3.C.5',
-    '3.D.2.a': '3.C.5.a', # not in climate_categories
-    '3.D.2.b': '3.C.5.b', # not in climate_categories
-    '3.E': '3.C.1.c',
-    '3.F': '3.C.1.b',
-    '3.G': '3.C.2',
-    '3.H': '3.C.3',
-    '3.I': '3.C.8.a', # merge this with cat below
-    '3.J': '3.C.8.b', # merge with cat above
-    '4': 'M.LULUCF',
-    '4.A': '3.B.1',
-    '4.A.1': '3.B.1.a',
-    '4.A.2': '3.B.1.b',
-    '4.A.2.a': '3.B.1.b.i',
-    '4.A.2.b': '3.B.1.b.ii',
-    '4.A.2.c': '3.B.1.b.iii',
-    '4.A.2.d': '3.B.1.b.iv',
-    '4.A.2.e': '3.B.1.b.v',
-    '4.B': '3.B.2',
-    '4.B.1': '3.B.2.a',
-    '4.B.2': '3.B.2.b',
-    '4.B.2.a': '3.B.2.b.i',
-    '4.B.2.b': '3.B.2.b.ii',
-    '4.B.2.c': '3.B.2.b.iii',
-    '4.B.2.d': '3.B.2.b.iv',
-    '4.B.2.e': '3.B.2.b.v',
-    '4.C': '3.B.3',
-    '4.C.1': '3.B.3.a',
-    '4.C.2': '3.B.3.b',
-    '4.C.2.a': '3.B.3.b.i',
-    '4.C.2.b': '3.B.3.b.ii',
-    '4.C.2.c': '3.B.3.b.iii',
-    '4.C.2.d': '3.B.3.b.iv',
-    '4.C.2.e': '3.B.3.b.v',
-    '4.D': '3.B.4',
-    '4.D.1': '3.B.4.a',
-    '4.D.2': '3.B.4.b',
-    '4.D.2.a': '3.B.4.b.i',
-    '4.D.2.b': '3.B.4.b.ii',
-    '4.D.2.c': '3.B.4.b.iii',
-    '4.D.2.d': '3.B.4.b.iv',
-    '4.D.2.e': '3.B.4.b.v',
-    '4.E': '3.B.5',
-    '4.E.1': '3.B.5.a',
-    '4.E.2': '3.B.5.b',
-    '4.E.2.a': '3.B.5.b.i',
-    '4.E.2.b': '3.B.5.b.ii',
-    '4.E.2.c': '3.B.5.b.iii',
-    '4.E.2.d': '3.B.5.b.iv',
-    '4.E.2.e': '3.B.5.b.v',
-    '4.F': '3.B.6',
-    '4.F.1': '3.B.6.a',
-    '4.F.2': '3.B.6.b',
-    '4.F.2.a': '3.B.6.b.i',
-    '4.F.2.b': '3.B.6.b.ii',
-    '4.F.2.c': '3.B.6.b.iii',
-    '4.F.2.d': '3.B.6.b.iv',
-    '4.F.2.e': '3.B.6.b.v',
-    '4.G': '3.D.1',
-    '4.H': '3.D.2',
-    '5': '4',
-    '5.A': '4.A',
-    '5.B': '4.B',
-    '5.C': '4.C',
-    '5.C.1': '4.C.1',
-    '5.C.2': '4.C.2',
-    '5.D': '4.D',
-    '5.D.1': '4.D.1',
-    '5.D.2': '4.D.2',
-    '5.E': '4.E',
+cat_mapping = {  # categories not listed here have the same code as in IPCC 2006 specifications
+    "3": "M.AG",
+    "3.A": "3.A.1",
+    "3.A.1": "3.A.1.a",
+    "3.A.1.a": "3.A.1.a.i",
+    "3.A.1.b": "3.A.1.a.ii",
+    "3.A.2": "3.A.1.c",
+    "3.A.3": "3.A.1.h",
+    "3.A.4.a": "3.A.1.b",
+    "3.A.4.b": "3.A.1.d",
+    "3.A.4.c": "3.A.1.f",
+    "3.A.4.d": "3.A.1.g",
+    "3.A.4.e": "3.A.1.i",
+    "3.B": "3.A.2",
+    "3.B.1": "3.A.2.a",
+    "3.B.1.a": "3.A.2.a.i",
+    "3.B.1.b": "3.A.2.a.ii",
+    "3.B.2": "3.A.2.c",
+    "3.B.3": "3.A.2.h",
+    "3.B.4.a": "3.A.2.b",
+    "3.B.4.b": "3.A.2.d",
+    "3.B.4.c": "3.A.2.f",
+    "3.B.4.d": "3.A.2.g",
+    "3.B.4.e": "3.A.2.i",
+    "3.B.5": "3.C.6",
+    "3.C": "3.C.7",
+    "3.D.1": "3.C.4",
+    "3.D.2": "3.C.5",
+    "3.D.2.a": "3.C.5.a",  # not in climate_categories
+    "3.D.2.b": "3.C.5.b",  # not in climate_categories
+    "3.E": "3.C.1.c",
+    "3.F": "3.C.1.b",
+    "3.G": "3.C.2",
+    "3.H": "3.C.3",
+    "3.I": "3.C.8.a",  # merge this with cat below
+    "3.J": "3.C.8.b",  # merge with cat above
+    "4": "M.LULUCF",
+    "4.A": "3.B.1",
+    "4.A.1": "3.B.1.a",
+    "4.A.2": "3.B.1.b",
+    "4.A.2.a": "3.B.1.b.i",
+    "4.A.2.b": "3.B.1.b.ii",
+    "4.A.2.c": "3.B.1.b.iii",
+    "4.A.2.d": "3.B.1.b.iv",
+    "4.A.2.e": "3.B.1.b.v",
+    "4.B": "3.B.2",
+    "4.B.1": "3.B.2.a",
+    "4.B.2": "3.B.2.b",
+    "4.B.2.a": "3.B.2.b.i",
+    "4.B.2.b": "3.B.2.b.ii",
+    "4.B.2.c": "3.B.2.b.iii",
+    "4.B.2.d": "3.B.2.b.iv",
+    "4.B.2.e": "3.B.2.b.v",
+    "4.C": "3.B.3",
+    "4.C.1": "3.B.3.a",
+    "4.C.2": "3.B.3.b",
+    "4.C.2.a": "3.B.3.b.i",
+    "4.C.2.b": "3.B.3.b.ii",
+    "4.C.2.c": "3.B.3.b.iii",
+    "4.C.2.d": "3.B.3.b.iv",
+    "4.C.2.e": "3.B.3.b.v",
+    "4.D": "3.B.4",
+    "4.D.1": "3.B.4.a",
+    "4.D.2": "3.B.4.b",
+    "4.D.2.a": "3.B.4.b.i",
+    "4.D.2.b": "3.B.4.b.ii",
+    "4.D.2.c": "3.B.4.b.iii",
+    "4.D.2.d": "3.B.4.b.iv",
+    "4.D.2.e": "3.B.4.b.v",
+    "4.E": "3.B.5",
+    "4.E.1": "3.B.5.a",
+    "4.E.2": "3.B.5.b",
+    "4.E.2.a": "3.B.5.b.i",
+    "4.E.2.b": "3.B.5.b.ii",
+    "4.E.2.c": "3.B.5.b.iii",
+    "4.E.2.d": "3.B.5.b.iv",
+    "4.E.2.e": "3.B.5.b.v",
+    "4.F": "3.B.6",
+    "4.F.1": "3.B.6.a",
+    "4.F.2": "3.B.6.b",
+    "4.F.2.a": "3.B.6.b.i",
+    "4.F.2.b": "3.B.6.b.ii",
+    "4.F.2.c": "3.B.6.b.iii",
+    "4.F.2.d": "3.B.6.b.iv",
+    "4.F.2.e": "3.B.6.b.v",
+    "4.G": "3.D.1",
+    "4.H": "3.D.2",
+    "5": "4",
+    "5.A": "4.A",
+    "5.B": "4.B",
+    "5.C": "4.C",
+    "5.C.1": "4.C.1",
+    "5.C.2": "4.C.2",
+    "5.D": "4.D",
+    "5.D.1": "4.D.1",
+    "5.D.2": "4.D.2",
+    "5.E": "4.E",
 }

 # comments
@@ -176,11 +305,29 @@ cat_mapping = { # categories not listed here have the same UNFCCC_GHG_data as in
 # '3.A.4.g.ii.',

 aggregate_cats = {
-    '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-    '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.3', '3.B.4', '3.B.5', '3.B.6'], 'name': 'Land'},
-    '3.C.1': {'sources': ['3.C.1.b','3.C.1.c'], 'name': 'Emissions from Biomass Burning'},
-    '3.C.8': {'sources': ['3.C.8.a', '3.C.8.b'], 'name': 'Other'},
-    '3.C': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7', '3.C.8'], 'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-    '3.D': {'sources': ['3.D.1', '3.D.2'], 'name': 'Other'},
-    '3': {'sources': ['3.A', '3.B', '3.C', '3.D'], 'name': 'AFOLU'},
+    "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+    "3.B": {
+        "sources": ["3.B.1", "3.B.2", "3.B.3", "3.B.4", "3.B.5", "3.B.6"],
+        "name": "Land",
+    },
+    "3.C.1": {
+        "sources": ["3.C.1.b", "3.C.1.c"],
+        "name": "Emissions from Biomass Burning",
+    },
+    "3.C.8": {"sources": ["3.C.8.a", "3.C.8.b"], "name": "Other"},
+    "3.C": {
+        "sources": [
+            "3.C.1",
+            "3.C.2",
+            "3.C.3",
+            "3.C.4",
+            "3.C.5",
+            "3.C.6",
+            "3.C.7",
+            "3.C.8",
+        ],
+        "name": "Aggregate sources and non-CO2 emissions sources on land",
+    },
+    "3.D": {"sources": ["3.D.1", "3.D.2"], "name": "Other"},
+    "3": {"sources": ["3.A", "3.B", "3.C", "3.D"], "name": "AFOLU"},
 }
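Both dicts are consumed by the reader scripts further down: `cat_mapping` renames category codes in the interchange-format dataframe, and `aggregate_cats` rebuilds each parent category by summing its IPCC 2006 sources. Condensed from the reader code below (the real scripts also coerce the time columns to numeric first):

    import pandas as pd

    cat_label = "category (IPCC2006)"  # column name as built in the readers
    data_if_2006 = data_if_2006.replace({cat_label: cat_mapping})
    for cat_to_agg, spec in aggregate_cats.items():
        # select the source categories of the aggregate (sketch)
        df_test = data_if_2006[data_if_2006[cat_label].isin(spec["sources"])]
        if len(df_test) > 0:
            df_combine = df_test.groupby(
                by=["source", "scenario (PRIMAP)", "provenance",
                    "area (ISO3)", "entity", "unit"]
            ).sum()
            df_combine.insert(0, cat_label, cat_to_agg)
            df_combine.insert(1, "orig_cat_name", spec["name"])
            data_if_2006 = pd.concat([data_if_2006, df_combine.reset_index()])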

+ 90 - 52
src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR4_from_xlsx.py

@@ -2,14 +2,15 @@
 Read Chile's 2020 inventory from xlsx

 This script reads data from Chile's 2020 national inventory which is underlying BUR4.
- Data is read from the xlsx file"""
+Data is read from the xlsx file
+"""

 import os
 import sys

 import pandas as pd
 import primap2 as pm2
-from .config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
+from config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
 from primap2.pm2io._data_reading import filter_data, matches_time_format

 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -20,42 +21,49 @@ if __name__ == "__main__":
     # ###

     # folders and files
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Chile' / 'BUR4'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Chile'
+    input_folder = downloaded_data_path / "UNFCCC" / "Chile" / "BUR4"
+    output_folder = extracted_data_path / "UNFCCC" / "Chile"
     if not output_folder.exists():
         output_folder.mkdir()

-    output_filename = 'CHL_BUR4_2021_'
+    output_filename = "CHL_BUR4_2021_"
 
-    inventory_file = 'Inventario_Nacional_de_GEI-1990-2018.xlsx'
+    inventory_file = "Inventario_Nacional_de_GEI-1990-2018.xlsx"
     years_to_read = range(1990, 2018 + 1)

     # configuration for conversion to PRIMAP2 data format
     unit_row = "header"
     unit_info = {
-        'regexp_entity': r'(.*)\s\(.*\)$',
-        'regexp_unit': r'.*\s\((.*)\)$',
-        'default_unit': 'kt',
-        'manual_repl_unit': {
-            'kt CO₂ eq': 'ktCO2eq',
-            'HFC (kt CO₂ eq)': 'ktCO2eq',
-            'PFC (kt CO₂ eq)': 'ktCO2eq',
-            'SF₆ (kt CO₂ eq)': 'ktCO2eq',
+        "regexp_entity": r"(.*)\s\(.*\)$",
+        "regexp_unit": r".*\s\((.*)\)$",
+        "default_unit": "kt",
+        "manual_repl_unit": {
+            "kt CO₂ eq": "ktCO2eq",
+            "HFC (kt CO₂ eq)": "ktCO2eq",
+            "PFC (kt CO₂ eq)": "ktCO2eq",
+            "SF₆ (kt CO₂ eq)": "ktCO2eq",
+        },
+        "manual_repl_entity": {
+            "kt CO₂ eq": "KYOTOGHG (AR4GWP100)",
+            "HFC (kt CO₂ eq)": "HFCS (AR4GWP100)",
+            "PFC (kt CO₂ eq)": "PFCS (AR4GWP100)",
+            "SF₆ (kt CO₂ eq)": "SF6 (AR4GWP100)",
         },
-        'manual_repl_entity': {
-            'kt CO₂ eq': 'KYOTOGHG (AR4GWP100)',
-            'HFC (kt CO₂ eq)': 'HFCS (AR4GWP100)',
-            'PFC (kt CO₂ eq)': 'PFCS (AR4GWP100)',
-            'SF₆ (kt CO₂ eq)': 'SF6 (AR4GWP100)',
-        }
     }
-    cols_to_drop = ['Unnamed: 14', 'Unnamed: 16', 'Código IPCC.1',
-                    'Categorías de fuente y sumidero de gases de efecto invernadero.1']
+    cols_to_drop = [
+        "Unnamed: 14",
+        "Unnamed: 16",
+        "Código IPCC.1",
+        "Categorías de fuente y sumidero de gases de efecto invernadero.1",
+    ]
     # columns for category code and original category name
-    index_cols = ['Código IPCC', 'Categorías de fuente y sumidero de gases de efecto invernadero']
+    index_cols = [
+        "Código IPCC",
+        "Categorías de fuente y sumidero de gases de efecto invernadero",
+    ]

     # operations on long format DF
-    cols_for_space_stripping = ['category', 'orig_cat_name', 'entity']
+    cols_for_space_stripping = ["category", "orig_cat_name", "entity"]

     time_format = "%Y"

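For illustration, the `unit_info` regexes above split a column header such as `"SF₆ (kt CO₂ eq)"` into entity and unit; headers listed in `manual_repl_unit`/`manual_repl_entity` are mapped directly instead, and headers without a parenthesised unit presumably fall back to `default_unit`:

    import re

    header = "SF₆ (kt CO₂ eq)"
    entity = re.match(r"(.*)\s\(.*\)$", header).group(1)  # -> "SF₆"
    unit = re.match(r".*\s\((.*)\)$", header).group(1)    # -> "kt CO₂ eq"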
@@ -85,7 +93,7 @@ if __name__ == "__main__":
         "source": "CHL-GHG-Inventory",
         "source": "CHL-GHG-Inventory",
         "provenance": "measured",
         "provenance": "measured",
         "area": "CHL",
         "area": "CHL",
-        "scenario": "BUR4"
+        "scenario": "BUR4",
     }

     coords_value_mapping = {
@@ -117,14 +125,14 @@ if __name__ == "__main__":
     }

     coords_value_filling = {
-        'category': {  # col to fill
-            'orig_cat_name': {  # col to fill from
-                'Todas las emisiones y las absorciones nacionales': '0',  # from value: to value
-                'Tanque internacional': 'M.BK',
-                'Aviación internacional': 'M.BK.A',
-                'Navegación internacional': 'M.BK.M',
-                'Operaciones multilaterales': 'M.MULTIOP',
-                'Emisiones de CO2 de la biomasa': 'M.BIO',
+        "category": {  # col to fill
+            "orig_cat_name": {  # col to fill from (from value: to value)
+                "Todas las emisiones y las absorciones nacionales": "0",
+                "Tanque internacional": "M.BK",
+                "Aviación internacional": "M.BK.A",
+                "Navegación internacional": "M.BK.M",
+                "Operaciones multilaterales": "M.MULTIOP",
+                "Emisiones de CO2 de la biomasa": "M.BIO",
             }
         }
     }
@@ -141,7 +149,9 @@ if __name__ == "__main__":
     filter_keep = {}

     meta_data = {
-        "references": "https://unfccc.int/documents/267936, https://snichile.mma.gob.cl/wp-content/uploads/2021/03/Inventario_Nacional_de_GEI-1990-2018.xlsx",
+        "references": "https://unfccc.int/documents/267936, "
+        "https://snichile.mma.gob.cl/wp-content/uploads/2021/03/"
+        "Inventario_Nacional_de_GEI-1990-2018.xlsx",
         "rights": "",
         "rights": "",
         "contact": "mail@johannes-guetschow.de.de",
         "contact": "mail@johannes-guetschow.de.de",
         "title": "Chile: BUR4",
         "title": "Chile: BUR4",
@@ -165,16 +175,24 @@ if __name__ == "__main__":
     for year in years_to_read:
         # read sheet for the year. Each sheet contains several tables,
         # we only read the upper table as the other tables are summary tables
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=str(year), skiprows=2, nrows=442, engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=str(year),
+            skiprows=2,
+            nrows=442,
+            engine="openpyxl",
+        )
         # drop the columns which are empty and repetition of the metadata for the second block
-        df_current.drop(cols_to_drop, axis=1, inplace=True)
+        df_current = df_current.drop(cols_to_drop, axis=1)
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set multi-index. necessary for the stack operation in the conversion to long format
         df_current = df_current.set_index(index_cols)
         # add unit row using information from entity row and add to index
-        df_current = pm2.pm2io.nir_add_unit_information(df_current, unit_row=unit_row, **unit_info)
+        df_current = pm2.pm2io.nir_add_unit_information(
+            df_current, unit_row=unit_row, **unit_info
+        )
         # actual conversion to long format
         df_current = pm2.pm2io.nir_convert_df_to_long(df_current, year)
         # aggregate to one df
@@ -192,7 +210,7 @@ if __name__ == "__main__":
     for col in cols_for_space_stripping:
         df_all[col] = df_all[col].str.strip()

-    df_all["category"] = df_all["category"].str.rstrip('.')
+    df_all["category"] = df_all["category"].str.rstrip(".")

     data_if = pm2.pm2io.convert_long_dataframe_if(
         df_all,
@@ -204,11 +222,11 @@ if __name__ == "__main__":
         coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
         filter_keep=filter_keep,
-        meta_data=meta_data
+        meta_data=meta_data,
+        time_format="%Y",
     )

-
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()
@@ -216,11 +234,16 @@ if __name__ == "__main__":
     # ###
     # save data to IF and native format
     # ###
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )

     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )

     # ###
     # conversion to ipcc 2006 categories
@@ -236,10 +259,10 @@ if __name__ == "__main__":
         coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
         filter_keep=filter_keep,
-        meta_data=meta_data
+        meta_data=meta_data,
     )

-    cat_label = 'category (' + coords_terminologies_2006["category"] + ')'
+    cat_label = "category (" + coords_terminologies_2006["category"] + ")"
     filter_data(data_if_2006, filter_remove=filter_remove_IPCC2006)
     data_if_2006 = data_if_2006.replace({cat_label: cat_mapping})

@@ -252,10 +275,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
             df_combine = df_test.copy(deep=True)
 
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]

@@ -263,7 +286,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")

             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity', 'unit']).sum()
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()

             df_combine.insert(0, cat_label, cat_to_agg)
             df_combine.insert(1, "orig_cat_name", aggregate_cats[cat_to_agg]["name"])
@@ -274,12 +305,19 @@ if __name__ == "__main__":
         else:
             print(f"no data to aggregate category {cat_to_agg}")

-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
     # convert back to IF to have units in the fixed format
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()

-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies_2006["category"]), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies_2006["category"]),
+        data_if_2006,
+    )

     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        encoding=encoding,
+    )
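The `encoding` dict built from `compression = dict(zlib=True, complevel=9)` turns on per-variable zlib compression when xarray writes the netCDF file. A minimal standalone illustration with a hypothetical dataset:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"CO2": ("time", np.arange(3.0))})
    encoding = {var: dict(zlib=True, complevel=9) for var in ds.data_vars}
    ds.to_netcdf("example.nc", encoding=encoding)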

+ 100 - 55
src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR5_from_xlsx.py

@@ -1,12 +1,16 @@
-# this script reads data from Chile's 2020 national inventory which is underlying BUR4
-# Data is read from the xlsx file
+"""
+Read Chile's 2022 inventory from xlsx
+
+This script reads data from Chile's 2022 national inventory which is underlying BUR5.
+Data is read from the xlsx file
+"""

 import os
 import sys

 import pandas as pd
 import primap2 as pm2
-from .config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
+from config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
 from primap2.pm2io._data_reading import filter_data, matches_time_format

 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -17,43 +21,50 @@ if __name__ == "__main__":
     # ###

     # folders and files
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Chile' / 'BUR5'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Chile'
+    input_folder = downloaded_data_path / "UNFCCC" / "Chile" / "BUR5"
+    output_folder = extracted_data_path / "UNFCCC" / "Chile"
     if not output_folder.exists():
         output_folder.mkdir()

-    output_filename = 'CHL_BUR5_2022_'
+    output_filename = "CHL_BUR5_2022_"
 
-    inventory_file = '2022_GEI_CL.xlsx'
+    inventory_file = "2022_GEI_CL.xlsx"
     years_to_read = range(1990, 2020 + 1)
-    time_format='%Y'
+    time_format = "%Y"

     # configuration for conversion to PRIMAP2 data format
     unit_row = "header"
     unit_info = {
-        'regexp_entity': r'(.*)\s\(.*\)$',
-        'regexp_unit': r'.*\s\((.*)\)$',
-        'default_unit': 'kt',
-        'manual_repl_unit': {
-            'kt CO₂ eq': 'ktCO2eq',
-            'HFC (kt CO₂ eq)': 'ktCO2eq',
-            'PFC (kt CO₂ eq)': 'ktCO2eq',
-            'SF₆ (kt CO₂ eq)': 'ktCO2eq',
+        "regexp_entity": r"(.*)\s\(.*\)$",
+        "regexp_unit": r".*\s\((.*)\)$",
+        "default_unit": "kt",
+        "manual_repl_unit": {
+            "kt CO₂ eq": "ktCO2eq",
+            "HFC (kt CO₂ eq)": "ktCO2eq",
+            "PFC (kt CO₂ eq)": "ktCO2eq",
+            "SF₆ (kt CO₂ eq)": "ktCO2eq",
+        },
+        "manual_repl_entity": {
+            "kt CO₂ eq": "KYOTOGHG (AR4GWP100)",
+            "HFC (kt CO₂ eq)": "HFCS (AR4GWP100)",
+            "PFC (kt CO₂ eq)": "PFCS (AR4GWP100)",
+            "SF₆ (kt CO₂ eq)": "SF6 (AR4GWP100)",
         },
-        'manual_repl_entity': {
-            'kt CO₂ eq': 'KYOTOGHG (AR4GWP100)',
-            'HFC (kt CO₂ eq)': 'HFCS (AR4GWP100)',
-            'PFC (kt CO₂ eq)': 'PFCS (AR4GWP100)',
-            'SF₆ (kt CO₂ eq)': 'SF6 (AR4GWP100)',
-        }
     }
-    cols_to_drop = ['Unnamed: 14', 'Unnamed: 16', 'Código IPCC.1',
-                    'Categorías de fuente y sumidero de gases de efecto invernadero.1']
+    cols_to_drop = [
+        "Unnamed: 14",
+        "Unnamed: 16",
+        "Código IPCC.1",
+        "Categorías de fuente y sumidero de gases de efecto invernadero.1",
+    ]
     # columns for category code and original category name
-    index_cols = ['Código IPCC', 'Categorías de fuente y sumidero de gases de efecto invernadero']
+    index_cols = [
+        "Código IPCC",
+        "Categorías de fuente y sumidero de gases de efecto invernadero",
+    ]

     # operations on long format DF
-    cols_for_space_stripping = ['category', 'orig_cat_name', 'entity']
+    cols_for_space_stripping = ["category", "orig_cat_name", "entity"]

     time_format = "%Y"

@@ -83,7 +94,7 @@ if __name__ == "__main__":
         "source": "CHL-GHG-Inventory",
         "source": "CHL-GHG-Inventory",
         "provenance": "measured",
         "provenance": "measured",
         "area": "CHL",
         "area": "CHL",
-        "scenario": "BUR5"
+        "scenario": "BUR5",
     }

     coords_value_mapping = {
@@ -115,14 +126,14 @@ if __name__ == "__main__":
     }

     coords_value_filling = {
-        'category': {  # col to fill
-            'orig_cat_name': {  # col to fill from
-                'Todas las emisiones y las absorciones nacionales': '0',  # from value: to value
-                'Tanque internacional': 'M.BK',
-                'Aviación internacional': 'M.BK.A',
-                'Navegación internacional': 'M.BK.M',
-                'Operaciones multilaterales': 'M.MULTIOP',
-                'Emisiones de CO2 de la biomasa': 'M.BIO',
+        "category": {  # col to fill
+            "orig_cat_name": {  # col to fill from (from value: to value)
+                "Todas las emisiones y las absorciones nacionales": "0",
+                "Tanque internacional": "M.BK",
+                "Aviación internacional": "M.BK.A",
+                "Navegación internacional": "M.BK.M",
+                "Operaciones multilaterales": "M.MULTIOP",
+                "Emisiones de CO2 de la biomasa": "M.BIO",
             }
         }
     }
@@ -132,14 +143,19 @@ if __name__ == "__main__":
             "entity": ["Absorciones CO₂", "Emisiones CO₂"],
             "entity": ["Absorciones CO₂", "Emisiones CO₂"],
         },
         },
         "f2": {
         "f2": {
-            "orig_cat_name": ["Partidas informativas", "Todas las emisiones nacionales"],
+            "orig_cat_name": [
+                "Partidas informativas",
+                "Todas las emisiones nacionales",
+            ],
         },
     }

     filter_keep = {}

     meta_data = {
-        "references": "https://unfccc.int/documents/624735, https://snichile.mma.gob.cl/wp-content/uploads/2023/04/2022_GEI_CL.xlsx",
+        "references": "https://unfccc.int/documents/624735, "
+        "https://snichile.mma.gob.cl/wp-content/uploads/2023/04/"
+        "2022_GEI_CL.xlsx",
         "rights": "",
         "rights": "",
         "contact": "mail@johannes-guetschow.de.de",
         "contact": "mail@johannes-guetschow.de.de",
         "title": "Chile: BUR5",
         "title": "Chile: BUR5",
@@ -163,16 +179,24 @@ if __name__ == "__main__":
     for year in years_to_read:
         # read sheet for the year. Each sheet contains several tables,
         # we only read the upper table as the other tables are summary tables
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=str(year), skiprows=2, nrows=442, engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=str(year),
+            skiprows=2,
+            nrows=442,
+            engine="openpyxl",
+        )
         # drop the columns which are empty and repetition of the metadata for the second block
-        df_current.drop(cols_to_drop, axis=1, inplace=True)
+        df_current = df_current.drop(cols_to_drop, axis=1)
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set multi-index. necessary for the stack operation in the conversion to long format
         df_current = df_current.set_index(index_cols)
         # add unit row using information from entity row and add to index
-        df_current = pm2.pm2io.nir_add_unit_information(df_current, unit_row=unit_row, **unit_info)
+        df_current = pm2.pm2io.nir_add_unit_information(
+            df_current, unit_row=unit_row, **unit_info
+        )
         # actual conversion to long format
         df_current = pm2.pm2io.nir_convert_df_to_long(df_current, year)
         # aggregate to one df
@@ -190,7 +214,7 @@ if __name__ == "__main__":
     for col in cols_for_space_stripping:
         df_all[col] = df_all[col].str.strip()

-    df_all["category"] = df_all["category"].str.rstrip('.')
+    df_all["category"] = df_all["category"].str.rstrip(".")

     data_if = pm2.pm2io.convert_long_dataframe_if(
         df_all,
@@ -206,8 +230,7 @@ if __name__ == "__main__":
         time_format=time_format,
     )

-
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()
@@ -215,11 +238,16 @@ if __name__ == "__main__":
     # ###
     # save data to IF and native format
     # ###
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )

     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )

     # ###
     # conversion to ipcc 2006 categories
@@ -236,10 +264,10 @@ if __name__ == "__main__":
         filter_remove=filter_remove,
         filter_keep=filter_keep,
         meta_data=meta_data,
-        time_format=time_format
+        time_format=time_format,
     )

-    cat_label = 'category (' + coords_terminologies_2006["category"] + ')'
+    cat_label = "category (" + coords_terminologies_2006["category"] + ")"
     filter_data(data_if_2006, filter_remove=filter_remove_IPCC2006)
     data_if_2006 = data_if_2006.replace({cat_label: cat_mapping})

@@ -252,10 +280,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
             df_combine = df_test.copy(deep=True)
 
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]

@@ -263,8 +291,18 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")

             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity', 'unit']).sum()
-            df_combine = df_combine.drop(columns=["category (IPCC2006_PRIMAP)", "orig_cat_name"])
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
+            df_combine = df_combine.drop(
+                columns=["category (IPCC2006_PRIMAP)", "orig_cat_name"]
+            )

             df_combine.insert(0, cat_label, cat_to_agg)
             df_combine.insert(1, "orig_cat_name", aggregate_cats[cat_to_agg]["name"])
@@ -275,12 +313,19 @@ if __name__ == "__main__":
         else:
             print(f"no data to aggregate category {cat_to_agg}")

-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
     # convert back to IF to have units in the fixed format
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()

-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies_2006["category"]), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies_2006["category"]),
+        data_if_2006,
+    )

     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        encoding=encoding,
+    )
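As in the BUR4 reader, the year columns are coerced to numeric before the groupby-sum; `errors="coerce"` turns any non-numeric cells (e.g. notation keys, if present) into NaN instead of raising:

    import pandas as pd

    pd.to_numeric(pd.Series(["1.5", "NE", 2.0]), errors="coerce")
    # -> [1.5, NaN, 2.0]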

+ 30 - 1
src/unfccc_ghg_data/unfccc_reader/Colombia/__init__.py

@@ -1 +1,30 @@
-"""Code to read Colombia's submissions"""
+"""Read Colombia's BURs, NIRs, NCs
+
+Scripts and configurations to read Argentina's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (red using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'COL'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    # print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=COL
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 104 - 84
src/unfccc_ghg_data/unfccc_reader/Colombia/read_COL_BUR3_from_xlsx.py

@@ -1,6 +1,10 @@
-# this script reads data from Colombia's BUR3
-# Data is read from the xlsx file which has been exported from the google docs
-# spreadsheet which is linked in the BUR
+"""
+Read Colombia's BUR3 from xlsx
+
+This script reads data from Colombia's BUR3.
+Data is read from the xlsx file which has been exported from the Google Docs
+spreadsheet which is linked in the BUR.
+"""

 import pandas as pd
 import primap2 as pm2
@@ -14,17 +18,17 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Colombia' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Colombia'
+    input_folder = downloaded_data_path / "UNFCCC" / "Colombia" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Colombia"
     if not output_folder.exists():
         output_folder.mkdir()

-    output_filename = 'COL_BUR3_2022_'
+    output_filename = "COL_BUR3_2022_"
 
-    inventory_file = 'TR_1990-2018_BUR3-AR5_VF.xlsx'
+    inventory_file = "TR_1990-2018_BUR3-AR5_VF.xlsx"
     years_to_read = range(1990, 2018 + 1)

-    sheet_to_read = 'TR 1990-2018'
+    sheet_to_read = "TR 1990-2018"
     cols_to_read = range(0, 47)

     compression = dict(zlib=True, complevel=9)
@@ -37,7 +41,6 @@ if __name__ == "__main__":
         "unit": "unit",
         "unit": "unit",
     }
     }
 
 
-
     coords_terminologies = {
         "area": "ISO3",
         "category": "IPCC2006",
@@ -54,53 +57,52 @@ if __name__ == "__main__":
     coords_value_mapping = {
         "unit": "PRIMAP1",
         "entity": {
-            'Absorciones CO2': 'CO2 Absorptions',
-            'Emisiones CO2': 'CO2 Emissions',
-            'Emisiones netas (AR5GWP100)': 'KYOTOGHG (AR5GWP100)',
-            'HFC-23': 'HFC23',
-            'HFC-32': 'HFC32',
+            "Absorciones CO2": "CO2 Absorptions",
+            "Emisiones CO2": "CO2 Emissions",
+            "Emisiones netas (AR5GWP100)": "KYOTOGHG (AR5GWP100)",
+            "HFC-23": "HFC23",
+            "HFC-32": "HFC32",
             #'HFC-41': 'HFC41',
-            'HFC-43-10mee': 'HFC4310mee',
-            'HFC-125': 'HFC125',
+            "HFC-43-10mee": "HFC4310mee",
+            "HFC-125": "HFC125",
             #'HFC-134': 'HFC134',
-            'HFC-134a': 'HFC134a',
-            'HFC-152a': 'HFC152a',
+            "HFC-134a": "HFC134a",
+            "HFC-152a": "HFC152a",
             #'HFC-143': 'HFC143',
-            'HFC-143a': 'HFC143a',
-            'HFC-227ea': 'HFC227ea',
-            'HFC-236fa': 'HFC236fa',
+            "HFC-143a": "HFC143a",
+            "HFC-227ea": "HFC227ea",
+            "HFC-236fa": "HFC236fa",
             #'HFC-245ca': 'HFC245ca',
-            'HFC-245fa': 'HFC245fa',
-            'HFC-365mfc': 'HFC365mfc',
-            'PFC-116': 'C2F6',
-            'PFC-14': 'CF4',
+            "HFC-245fa": "HFC245fa",
+            "HFC-365mfc": "HFC365mfc",
+            "PFC-116": "C2F6",
+            "PFC-14": "CF4",
         },
     }

-
     filter_remove = {
         "fGWP": {
             "entity": [
-                'Absorciones CO2 (AR5GWP100)',
-                'Absorciones totales (AR5GWP100)',
-                'CH4 (AR5GWP100)',
-                'Emisiones CO2 (AR5GWP100)',
-                'Total emisiones (AR5GWP100)',
-                'HFC-125 (AR5GWP100)',
-                'HFC-134a (AR5GWP100)',
-                'HFC-143a (AR5GWP100)',
-                'HFC-152a (AR5GWP100)',
-                'HFC-227ea (AR5GWP100)',
-                'HFC-23 (AR5GWP100)',
-                'HFC-236fa (AR5GWP100)',
-                'HFC-245fa (AR5GWP100)',
-                'HFC-32 (AR5GWP100)',
-                'HFC-365mfc (AR5GWP100)',
-                'HFC-43-10mee (AR5GWP100)',
-                'N2O (AR5GWP100)',
-                'PFC-116 (AR5GWP100)',
-                'PFC-14 (AR5GWP100)',
-                'SF6 (AR5GWP100)',
+                "Absorciones CO2 (AR5GWP100)",
+                "Absorciones totales (AR5GWP100)",
+                "CH4 (AR5GWP100)",
+                "Emisiones CO2 (AR5GWP100)",
+                "Total emisiones (AR5GWP100)",
+                "HFC-125 (AR5GWP100)",
+                "HFC-134a (AR5GWP100)",
+                "HFC-143a (AR5GWP100)",
+                "HFC-152a (AR5GWP100)",
+                "HFC-227ea (AR5GWP100)",
+                "HFC-23 (AR5GWP100)",
+                "HFC-236fa (AR5GWP100)",
+                "HFC-245fa (AR5GWP100)",
+                "HFC-32 (AR5GWP100)",
+                "HFC-365mfc (AR5GWP100)",
+                "HFC-43-10mee (AR5GWP100)",
+                "N2O (AR5GWP100)",
+                "PFC-116 (AR5GWP100)",
+                "PFC-14 (AR5GWP100)",
+                "SF6 (AR5GWP100)",
             ],
         },
     }
@@ -116,25 +118,33 @@ if __name__ == "__main__":
         "institution": "UNFCCC",
         "institution": "UNFCCC",
     }
     }
 
 
-
     # read the data
-    data_raw = pd.read_excel(input_folder / inventory_file, sheet_name=sheet_to_read,
-                             skiprows=0, nrows=15025, usecols=cols_to_read,
-                             engine="openpyxl", header=None)
+    data_raw = pd.read_excel(
+        input_folder / inventory_file,
+        sheet_name=sheet_to_read,
+        skiprows=0,
+        nrows=15025,
+        usecols=cols_to_read,
+        engine="openpyxl",
+        header=None,
+    )

     # fill the units to the right as for merged cells the unit is only in the first cell
-    data_raw.iloc[unit_row] = data_raw.iloc[unit_row].fillna(axis=0, method="ffill")
+    data_raw.iloc[unit_row] = data_raw.iloc[unit_row].ffill(axis=0)
     merge_rows = [1, 2]
     for row in merge_rows:
         data_raw.iloc[row] = data_raw.iloc[row].astype(str).str.replace("nan", "")
     data_raw.iloc[merge_rows[0]] = (
-    data_raw.iloc[merge_rows[0]].astype(str) + " " + data_raw.iloc[
-            merge_rows[1]].astype(str))
+        data_raw.iloc[merge_rows[0]].astype(str)
+        + " "
+        + data_raw.iloc[merge_rows[1]].astype(str)
+    )
     data_raw.iloc[merge_rows[0]] = data_raw.iloc[merge_rows[0]].str.strip()
     data_raw = data_raw.drop(index=data_raw.index[merge_rows[1]])

     # merge the category cols
     def join_code_parts(series):
+        """Create a code from the data in the individual columns"""
         code = series.iloc[0]
         for part in series.iloc[1:]:
             if part != "nan":
@@ -143,10 +153,11 @@ if __name__ == "__main__":
             code = "0"
             code = "0"
         return code
         return code
 
 
-    cat_columns = [0, 1, 2, 3, 4, 5] # xlsx cols are ["MOD","CAP","CAT","SCAT","NROM",
+    cat_columns = [0, 1, 2, 3, 4, 5]  # xlsx cols are ["MOD","CAP","CAT","SCAT","NROM",
     # "NUM"]
     # "NUM"]
-    data_raw["category"] = data_raw[cat_columns].astype(str).agg(func=join_code_parts,
-                                                                 axis=1)
+    data_raw["category"] = (
+        data_raw[cat_columns].astype(str).agg(func=join_code_parts, axis=1)
+    )
     data_raw = data_raw.drop(columns=cat_columns)
     data_raw = data_raw.drop(columns=cat_columns)

     # prepare the dataframe for processing with primap2 functions
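The merged-cell fix a few hunks up (`.ffill(axis=0)`, replacing the deprecated `fillna(method="ffill")`) forward-fills the unit row, because merged cells in the exported xlsx leave the unit only in the first cell of each block:

    import pandas as pd

    units = pd.Series(["Gg", None, None, "GgCO2eq", None])
    units.ffill()  # -> ["Gg", "Gg", "Gg", "GgCO2eq", "GgCO2eq"]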
@@ -162,27 +173,29 @@ if __name__ == "__main__":
     for year in years:
         data_year = data_raw[data_raw["ANO"] == year]
         data_year = data_year.drop(columns=["ANO", "Categorías de fuente y sumideros"])
-        df_long_new = pm2.pm2io.nir_convert_df_to_long(data_year, year,
-                                                       ["category", "unit", "entity",
-                                                        "time", "data"])
+        df_long_new = pm2.pm2io.nir_convert_df_to_long(
+            data_year, year, ["category", "unit", "entity", "time", "data"]
+        )
         if df_all is None:
             df_all = df_long_new
         else:
-            df_all = pd.concat([df_all, df_long_new], axis=0, join='outer')
+            df_all = pd.concat([df_all, df_long_new], axis=0, join="outer")

     df_all["category"] = df_all["category"].str[0]

     # map units
-    df_all["unit"] = df_all["unit"].replace({
-        'GEI DIRECTOS - Gg ': 'Gg',
-        'GEI DIRECTOS - Gg CO2 equivalente': 'GgCO2eq',
-    }
+    df_all["unit"] = df_all["unit"].replace(
+        {
+            "GEI DIRECTOS - Gg ": "Gg",
+            "GEI DIRECTOS - Gg CO2 equivalente": "GgCO2eq",
+        }
     )
     )

     # add GWP information to entity
     for entity in df_all["entity"].unique():
-        df_all["entity"][(df_all["entity"] == entity) & (
-                    df_all["unit"] == "GgCO2eq")] = f"{entity} (AR5GWP100)"
+        df_all["entity"][
+            (df_all["entity"] == entity) & (df_all["unit"] == "GgCO2eq")
+        ] = f"{entity} (AR5GWP100)"

     # reset index before conversion to pm2 IF
     df_all = df_all.reset_index(drop=True)
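The entity loop above still assigns through chained indexing (`df_all["entity"][mask] = ...`), which pandas flags with a SettingWithCopyWarning; an equivalent single-step form would be:

    mask = (df_all["entity"] == entity) & (df_all["unit"] == "GgCO2eq")
    df_all.loc[mask, "entity"] = f"{entity} (AR5GWP100)"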
@@ -196,26 +209,25 @@ if __name__ == "__main__":
     data_if = pm2.pm2io.convert_long_dataframe_if(
         df_all,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
-        convert_str=True
-        )
-
+        convert_str=True,
+        time_format="%Y",
+    )

     # combine CO2 emissions and absorptions
-    data_CO2 = data_if[data_if["entity"].isin([
-        'CO2 Absorptions', 'CO2 Emissions'])]
+    data_CO2 = data_if[data_if["entity"].isin(["CO2 Absorptions", "CO2 Emissions"])]
 
-    time_format = '%Y'
+    time_format = "%Y"
     time_columns = [
         col
-        for col in data_CO2.columns.values
+        for col in data_CO2.columns.to_numpy()
         if matches_time_format(col, time_format)
     ]

@@ -223,20 +235,23 @@ if __name__ == "__main__":
         data_CO2[col] = pd.to_numeric(data_CO2[col], errors="coerce")

     data_CO2 = data_CO2.groupby(
-        by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)',
+        by=[
+            "source",
+            "scenario (PRIMAP)",
+            "provenance",
+            "area (ISO3)",
             f"category ({coords_terminologies['category']})",
             f"category ({coords_terminologies['category']})",
-            'unit']).sum(min_count = 1)
+            "unit",
+        ]
+    ).sum(min_count=1)
 
-    data_CO2.insert(0, 'entity', 'CO2')
+    data_CO2.insert(0, "entity", "CO2")
     data_CO2 = data_CO2.reset_index()
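`min_count=1` on the sum above keeps groups that are all-NaN as NaN instead of collapsing them to 0, so missing CO2 data does not turn into a spurious zero:

    import pandas as pd

    s = pd.Series([float("nan"), float("nan")])
    s.sum()             # -> 0.0
    s.sum(min_count=1)  # -> nan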

     data_if = pd.concat([data_if, data_CO2])

-
-
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)

-
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()

@@ -245,7 +260,12 @@ if __name__ == "__main__":
     # ###
     if not output_folder.exists():
         output_folder.mkdir()
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )

     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Indonesia/__init__.py

@@ -0,0 +1,30 @@
+"""Read Indonesia's BURs, NIRs, NCs
+
+Scripts and configurations to read Indonesia's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'IDN'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    # print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=IDN
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 167 - 100
src/unfccc_ghg_data/unfccc_reader/Indonesia/read_IDN_BUR3_from_pdf.py

@@ -1,6 +1,11 @@
-# this script reads data from Indonesia's BUR3
-# Data is read from pdf
-# only the 2019 inventory is read as the BUR refers to BUR2 for earlier years
+"""
+Read Indonesia's BUR3 from pdf
+
+This script reads data from Indonesia's BUR3.
+Data are read from the pdf using camelot.
+Only the 2019 inventory is read, as the BUR refers to BUR2 for earlier years.
+
+"""
 
 import camelot
 import numpy as np
@@ -14,18 +19,19 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Indonesia' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Indonesia'
+    input_folder = downloaded_data_path / "UNFCCC" / "Indonesia" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Indonesia"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'IDN_BUR3_2021_'
+    output_filename = "IDN_BUR3_2021_"
 
-    inventory_file = 'IndonesiaBUR_3_FINAL_REPORT_2.pdf'
+    inventory_file = "IndonesiaBUR_3_FINAL_REPORT_2.pdf"
 
-    gwp_to_use = 'SARGWP100'
+    gwp_to_use = "SARGWP100"
 
-    pages_to_read = range(61,65) # 65 is not read properly but contains almost no data anyway, so add it by hand '61-65'
+    pages_to_read = range(61, 65)  # 65 is not read properly but contains almost no
+    # data anyway, so add it by hand
 
     compression = dict(zlib=True, complevel=9)
 
@@ -36,17 +42,18 @@ if __name__ == "__main__":
     # special header as category code and name in one column
     header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
 
+    time_format = "%Y"
 
     # manual category codes
     cat_codes_manual = {
-        'Total National Emissions and Removals': '0',
-        'Peat Decomposition': 'M.3.B.4.APD',
-        'Peat Fire': 'M.3.B.4.APF',
-        '4A1.2 Industrial Solid Waste Disposal': 'M.4.A.Ind',
+        "Total National Emissions and Removals": "0",
+        "Peat Decomposition": "M.3.B.4.APD",
+        "Peat Fire": "M.3.B.4.APF",
+        "4A1.2 Industrial Solid Waste Disposal": "M.4.A.Ind",
         #'3A2b Direct N2O Emissions from Manure Management': '3.A.2',
     }
 
-    cat_code_regexp = r'(?P<code>^[a-zA-Z0-9]{1,4})\s.*'
+    cat_code_regexp = r"(?P<code>^[a-zA-Z0-9]{1,4})\s.*"
 
     coords_cols = {
         "category": "category",
@@ -75,24 +82,26 @@ if __name__ == "__main__":
         "unit": "PRIMAP1",
         "unit": "PRIMAP1",
         "category": "PRIMAP1",
         "category": "PRIMAP1",
         "entity": {
         "entity": {
-            'Total 3 Gases': f"CO2CH4N2O ({gwp_to_use})",
-            'Net CO2 (1) (2)': 'CO2',
-            'CH4': f"CH4 ({gwp_to_use})",
-            'N2O': f"N2O ({gwp_to_use})",
-            'HFCs': f"HFCS ({gwp_to_use})",
-            'PFCs': f"PFCS ({gwp_to_use})",
-            'SF6': f"SF6 ({gwp_to_use})",
-            'NOx': 'NOX',
-            'CO': 'CO', # no mapping, just added for completeness here
-            'NMVOCs': 'NMVOC',
-            'SO2': 'SO2', # no mapping, just added for completeness here
-            'Other halogenated gases with CO2 equivalent conversion factors (3)': f"OTHERHFCS ({gwp_to_use})",
+            "Total 3 Gases": f"CO2CH4N2O ({gwp_to_use})",
+            "Net CO2 (1) (2)": "CO2",
+            "CH4": f"CH4 ({gwp_to_use})",
+            "N2O": f"N2O ({gwp_to_use})",
+            "HFCs": f"HFCS ({gwp_to_use})",
+            "PFCs": f"PFCS ({gwp_to_use})",
+            "SF6": f"SF6 ({gwp_to_use})",
+            "NOx": "NOX",
+            "CO": "CO",  # no mapping, just added for completeness here
+            "NMVOCs": "NMVOC",
+            "SO2": "SO2",  # no mapping, just added for completeness here
+            "Other halogenated gases with CO2 equivalent conversion factors (3)": f"OTHERHFCS ({gwp_to_use})",
         },
     }
 
-
     filter_remove = {
-        "fHFC": {"entity": 'Other halogenated gases without CO2 equivalent conversion factors (4)'}
+        "fHFC": {
+            "entity": "Other halogenated gases without CO2 equivalent conversion "
+            "factors (4)"
+        }
     }
 
     filter_keep = {}
@@ -107,84 +116,113 @@ if __name__ == "__main__":
     }
 
     # convert to mass units where possible
-    entities_to_convert_to_mass = [
-        'CH4', 'N2O', 'SF6'
-    ]
+    entities_to_convert_to_mass = ["CH4", "N2O", "SF6"]
 
-    # CO2 equivalents don't make sense for these substances, so unit has to be Gg instead of Gg CO2 equivalents as indicated in the table
-    entities_to_fix_unit = [
-        'NOx', 'CO', 'NMVOCs', 'SO2'
-    ]
+    # CO2 equivalents don't make sense for these substances, so unit has to be Gg
+    # instead of Gg CO2 equivalents as indicated in the table
+    entities_to_fix_unit = ["NOx", "CO", "NMVOCs", "SO2"]
 
     # add the data for the last page by hand as it's only one row
     data_last_page = [
-        ['5B Other (please specify)', 'Total 3 Gases', 'GgCO2eq', '2019', 'NE'],
-        ['5B Other (please specify)', 'Net CO2 (1) (2)', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'CH4', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'N2O', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'HFCs', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'PFCs', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'SF6', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'Other halogenated gases with CO2 equivalent conversion factors (3)', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'Other halogenated gases without CO2 equivalent conversion factors (4)', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'NOx', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'CO', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'NMVOCs', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'SO2', 'GgCO2eq', '2019', np.nan],
+        ["5B Other (please specify)", "Total 3 Gases", "GgCO2eq", "2019", "NE"],
+        ["5B Other (please specify)", "Net CO2 (1) (2)", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "CH4", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "N2O", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "HFCs", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "PFCs", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "SF6", "GgCO2eq", "2019", np.nan],
+        [
+            "5B Other (please specify)",
+            "Other halogenated gases with CO2 equivalent conversion factors (3)",
+            "GgCO2eq",
+            "2019",
+            np.nan,
+        ],
+        [
+            "5B Other (please specify)",
+            "Other halogenated gases without CO2 equivalent conversion factors (4)",
+            "GgCO2eq",
+            "2019",
+            np.nan,
+        ],
+        ["5B Other (please specify)", "NOx", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "CO", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "NMVOCs", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "SO2", "GgCO2eq", "2019", np.nan],
     ]
 
     df_last_page = pd.DataFrame(data_last_page, columns=header_long)
 
     aggregate_cats = {
-        '1.A.4': {'sources': ['1.A.4.a', '1.A.4.b'], 'name': 'Other Sectors (calculated)'},
-        '2.A.4': {'sources': ['2.A.4.a', '2.A.4.b', '2.A.4.d'], 'name': 'Other Process uses of Carbonates (calculated)'},
-        '2.B.8': {'sources': ['2.B.8.a', '2.B.8.b', '2.B.8.c', '2.B.8.f'], 'name': 'Petrochemical and Carbon Black production (calculated)'},
-        '4.A': {'sources': ['4.A.2', 'M.4.A.Ind'], 'name': 'Solid Waste Disposal (calculated)'},
+        "1.A.4": {
+            "sources": ["1.A.4.a", "1.A.4.b"],
+            "name": "Other Sectors (calculated)",
+        },
+        "2.A.4": {
+            "sources": ["2.A.4.a", "2.A.4.b", "2.A.4.d"],
+            "name": "Other Process uses of Carbonates (calculated)",
+        },
+        "2.B.8": {
+            "sources": ["2.B.8.a", "2.B.8.b", "2.B.8.c", "2.B.8.f"],
+            "name": "Petrochemical and Carbon Black production (calculated)",
+        },
+        "4.A": {
+            "sources": ["4.A.2", "M.4.A.Ind"],
+            "name": "Solid Waste Disposal (calculated)",
+        },
     }
 
     aggregate_cats_N2O = {
-        '3.A.2': {'sources': ['3.A.2.b'], 'name': '3A2 Manure Management'},
-        '3.A': {'sources': ['3.A.2'], 'name': '3A Livestock'},
+        "3.A.2": {"sources": ["3.A.2.b"], "name": "3A2 Manure Management"},
+        "3.A": {"sources": ["3.A.2"], "name": "3A Livestock"},
     }
 
     aggregate_cats_CO2CH4N2O = {
-        '3.A.2': {'sources': ['3.A.2', '3.A.2.b'], 'name': '3A2 Manure Management'},
+        "3.A.2": {"sources": ["3.A.2", "3.A.2.b"], "name": "3A2 Manure Management"},
     }
 
     df_all = None
 
     for page in pages_to_read:
-        tables = camelot.read_pdf(str(input_folder / inventory_file), pages=str(page),
-                                  flavor='lattice')
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file), pages=str(page), flavor="lattice"
+        )
         df_this_table = tables[0].df
         # replace line breaks, double, and triple spaces in category names
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
         # replace line breaks in units and entities
-        df_this_table.iloc[entity_row] = df_this_table.iloc[entity_row].str.replace('\n',
-                                                                                    '')
-        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].str.replace('\n', '')
+        df_this_table.iloc[entity_row] = df_this_table.iloc[entity_row].str.replace(
+            "\n", ""
+        )
+        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].str.replace(
+            "\n", ""
+        )
 
-        df_this_table = pm2.pm2io.nir_add_unit_information(df_this_table, unit_row=unit_row,
-                                                           entity_row=entity_row,
-                                                           regexp_entity=".*",
-                                                           default_unit="GgCO2eq")  # , **unit_info)
+        df_this_table = pm2.pm2io.nir_add_unit_information(
+            df_this_table,
+            unit_row=unit_row,
+            entity_row=entity_row,
+            regexp_entity=".*",
+            default_unit="GgCO2eq",
+        )
 
         # set index and convert to long format
         df_this_table = df_this_table.set_index(index_cols)
-        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(df_this_table, year,
-                                                              header_long)
+        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(
+            df_this_table, year, header_long
+        )
         df_this_table_long["orig_cat_name"] = df_this_table_long["orig_cat_name"].str[0]
 
         # combine with tables for other sectors (merge not append)
         if df_all is None:
             df_all = df_this_table_long
         else:
-            df_all = pd.concat([df_all, df_this_table_long], axis=0, join='outer')
+            df_all = pd.concat([df_all, df_this_table_long], axis=0, join="outer")
 
     # add the last page manually
-    df_all = pd.concat([df_all, df_last_page], axis=0, join='outer')
+    df_all = pd.concat([df_all, df_last_page], axis=0, join="outer")
 
     # fix the units of aerosols and precursors
     for entity in entities_to_fix_unit:
@@ -196,22 +234,24 @@ if __name__ == "__main__":
     # replace cat names by codes in col "category"
     # first the manual replacements
     df_all["category"] = df_all["category"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-        return m.group('code')
-    df_all["category"] = df_all["category"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_all["category"] = df_all["category"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
     df_all = df_all.reset_index(drop=True)
 
     ###### convert to primap2 IF
 
     # replace "," with "" in data
-    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(',','', regex=False)
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(",", "", regex=False)
 
     # make sure all col headers are str
     df_all.columns = df_all.columns.map(str)
 
-
-
     # ###
     # convert to PRIMAP2 interchange format
     # ###
@@ -222,12 +262,13 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
-        convert_str=True
-        )
+        convert_str=True,
+        time_format=time_format,
+    )
 
     cat_label = "category (IPCC2006)"
 
@@ -244,10 +285,9 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -255,8 +295,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, cat_label, cat_to_agg)
             df_combine.insert(1, "orig_cat_name", aggregate_cats[cat_to_agg]["name"])
@@ -267,10 +314,10 @@ if __name__ == "__main__":
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
-
     # delete cat 3 for N2O as it's wrong
-    index_3A_N2O = data_if[(data_if[cat_label] == '3') &
-                           (data_if['entity'] == 'N2O')].index
+    index_3A_N2O = data_if[
+        (data_if[cat_label] == "3") & (data_if["entity"] == "N2O")
+    ].index
     data_if = data_if.drop(index_3A_N2O)
 
     # aggregate cat 3 for N2O
@@ -283,10 +330,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -294,11 +341,20 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name", aggregate_cats_N2O[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_cats_N2O[cat_to_agg]["name"]
+            )
 
             df_combine = df_combine.reset_index()
 
@@ -307,8 +363,9 @@ if __name__ == "__main__":
             print(f"no data to aggregate category {cat_to_agg}")
 
     # delete cat 3.A.2 for CO2CH4N2O as it's wrong
-    index_3A2_CO2CH4N2O = data_if[(data_if[cat_label] == '3.A.2') &
-                           (data_if['entity'] == 'CH4CO2N2O (SARGWP100)')].index
+    index_3A2_CO2CH4N2O = data_if[
+        (data_if[cat_label] == "3.A.2") & (data_if["entity"] == "CH4CO2N2O (SARGWP100)")
+    ].index
     data_if = data_if.drop(index_3A2_CO2CH4N2O)
 
     # aggregate cat 3.A.2 for CO2CH4N2O
@@ -321,10 +378,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -332,11 +389,20 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name", aggregate_cats_CO2CH4N2O[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_cats_CO2CH4N2O[cat_to_agg]["name"]
+            )
 
             df_combine = df_combine.reset_index()
 
@@ -344,7 +410,6 @@ if __name__ == "__main__":
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
-
     data_if.attrs = attrs
 
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
@@ -372,9 +437,11 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"]), data_if)
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
         output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
-        encoding=encoding)
+        encoding=encoding,
+    )
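
The aggregation recipe that repeats three times in this script (select the year
columns, coerce them to numeric, group over the metadata columns, sum with
min_count=1) can be read in isolation. A condensed sketch; matches_time_format
is re-implemented inline as a stand-in for the helper of the same name so the
snippet is self-contained:

.. code-block:: python

    from datetime import datetime

    import pandas as pd

    def matches_time_format(value: str, time_format: str) -> bool:
        """Stand-in for unfccc_ghg_data.helper.matches_time_format."""
        try:
            datetime.strptime(value, time_format)
        except ValueError:
            return False
        return True

    df = pd.DataFrame({"unit": ["Gg", "Gg"], "2019": ["1234.5", "NE"]})
    time_columns = [col for col in df.columns if matches_time_format(col, "%Y")]
    for col in time_columns:
        # notation keys such as "NE" (not estimated) become NaN
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df_agg = df.groupby(by=["unit"]).sum(min_count=1)  # all-NaN groups stay NaN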

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Israel/__init__.py

@@ -0,0 +1,30 @@
+"""Read Israel's BURs, NIRs, NCs
+
+Scripts and configurations to read Israel's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'ISR'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    # print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=ISR
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 409 - 314
src/unfccc_ghg_data/unfccc_reader/Israel/config_isr_bur2.py

@@ -1,73 +1,91 @@
+"""Config for Israel's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 #### configuration for trend tables
 import locale
 
-gwp_to_use = 'SARGWP100'
-terminology_proc = 'IPCC2006_PRIMAP'
+gwp_to_use = "SARGWP100"
+terminology_proc = "IPCC2006_PRIMAP"
 # bunkers [0,1] need different specs
 trend_table_def = {
     # only GHG read, rest dropped
-    'GHG': {
-        'tables': [2],
-        'cols_add': {
-            'unit': 'ktCO2eq',
-            'category': '0',
+    "GHG": {
+        "tables": [2],
+        "cols_add": {
+            "unit": "ktCO2eq",
+            "category": "0",
         },
-        'given_col': 'entity',
-        'take_only': ['Total GHG'],
+        "given_col": "entity",
+        "take_only": ["Total GHG"],
     },
-    'CO2': {
-        'tables': [3],
-        'cols_add': {
-            'unit': 'kt',
-            'entity': 'CO2',
+    "CO2": {
+        "tables": [3],
+        "cols_add": {
+            "unit": "kt",
+            "entity": "CO2",
         },
-        'given_col': 'category',
+        "given_col": "category",
     },
-    'CH4': {
-        'tables': [5],
-        'cols_add': {
-            'unit': 'kt',
-            'entity': 'CH4',
+    "CH4": {
+        "tables": [5],
+        "cols_add": {
+            "unit": "kt",
+            "entity": "CH4",
         },
-        'given_col': 'category',
-        'take_only': [
-            'Total emissions', 'From fuel combustion',
-            'From Industrial processes', 'From Agriculture'
-        ], # ignore the waste time series as they don't cover the full sector
+        "given_col": "category",
+        "take_only": [
+            "Total emissions",
+            "From fuel combustion",
+            "From Industrial processes",
+            "From Agriculture",
+        ],  # ignore the waste time series as they don't cover the full sector
         # and lead to problems because of the methodology change in the inventory
     },
-    'N2O': {
-        'tables': [6],
-        'cols_add': {
-            'unit': 'kt',
-            'entity': 'N2O',
+    "N2O": {
+        "tables": [6],
+        "cols_add": {
+            "unit": "kt",
+            "entity": "N2O",
         },
-        'given_col': 'category',
+        "given_col": "category",
     },
-    'FGases': {
-        'tables': [7],
-        'cols_add': {
-            'unit': 'ktCO2eq',
-            'category': '0',
+    "FGases": {
+        "tables": [7],
+        "cols_add": {
+            "unit": "ktCO2eq",
+            "category": "0",
         },
-        'given_col': 'entity',
+        "given_col": "entity",
     },
 }
 
 #### configuration for inventory tables
 inv_tab_conf = {
-    'unit_row': 0,
-    'entity_row': 0,
-    'regex_unit': r"\((.*)\)",
-    'regex_entity': r"^(.*)\s\(",
-    'index_cols': 'category',
-    'cat_pos': (0, 0),
-    'header_long': ["category", "entity", "unit", "time", "data"],
-    'header_2010': ["2010", "CO2 emissions (Gg)", "CO2 removals (Gg)",
-                  "CH4 (Gg)", "N2O (Gg)", "CO (Gg)", "NOx (Gg)",
-                  "NMVOCs (Gg)", "SOx (Gg)", "SF6 (CO2eq Gg)",
-                  "HFCs (CO2eq Gg)", "PFCs (CO2eq Gg)"],
-    'unit_repl': {
+    "unit_row": 0,
+    "entity_row": 0,
+    "regex_unit": r"\((.*)\)",
+    "regex_entity": r"^(.*)\s\(",
+    "index_cols": "category",
+    "cat_pos": (0, 0),
+    "header_long": ["category", "entity", "unit", "time", "data"],
+    "header_2010": [
+        "2010",
+        "CO2 emissions (Gg)",
+        "CO2 removals (Gg)",
+        "CH4 (Gg)",
+        "N2O (Gg)",
+        "CO (Gg)",
+        "NOx (Gg)",
+        "NMVOCs (Gg)",
+        "SOx (Gg)",
+        "SF6 (CO2eq Gg)",
+        "HFCs (CO2eq Gg)",
+        "PFCs (CO2eq Gg)",
+    ],
+    "unit_repl": {
         "SF6 (CO2e Gg)": "GgCO2eq",
         "SF6 (CO2e Gg)": "GgCO2eq",
         "HFCs (CO2eGg)": "GgCO2eq",
         "HFCs (CO2eGg)": "GgCO2eq",
         "PFCs (CO2e Gg)": "GgCO2eq",
         "PFCs (CO2e Gg)": "GgCO2eq",
@@ -78,13 +96,13 @@ inv_tab_conf = {
 }
 
 inv_table_def = {
-    '1996': {'tables': [1, 2]},
-    '2000': {'tables': [3, 4]},
-    '2005': {'tables': [5, 6]},
-    '2010': {'tables': [7, 8]},
-    '2015': {'tables': [9, 10, 11]},
-    '2019': {'tables': [12, 13, 14]},
-    '2020': {'tables': [15, 16]},
+    "1996": {"tables": [1, 2]},
+    "2000": {"tables": [3, 4]},
+    "2005": {"tables": [5, 6]},
+    "2010": {"tables": [7, 8]},
+    "2015": {"tables": [9, 10, 11]},
+    "2019": {"tables": [12, 13, 14]},
+    "2020": {"tables": [15, 16]},
 }
 
 #### configuration for PM2 format
@@ -110,114 +128,114 @@ coords_defaults = {
 coords_value_mapping = {
     "unit": "PRIMAP1",
     "category": {
-        'Total national emissions and removals': '24540',
-        '0': '24540', # no mapping, just for completeness
-        'Total emissions and removals': '24540',
-        'Total emissions': '24540',
-        '1. Energy': '1',
-        'A. Fuel combustion (sectoral approach)': '1.A',
-        'A. From fuel combustion': '1.A',
-        'From fuel combustion': '1.A',
-        '1. Energy industries': '1.A.1',
-        '2. Manufacturing industries and construction': '1.A.2',
-        '2. Manufacturing, industries and construction': '1.A.2',
-        '3. Transport': '1.A.3',
-        '4. Other sectors': '1.A.4',
-        '4. Other': '1.A.4',
-        'Commercial, institutional residential sectors': '1.A.4.ab', # not BURDI
-        'Commercial, institutional': '1.A.4.a', #not BURDI
-        'residential sectors': '1.A.4.b', #not BURDI
-        'Agriculture, forestry and fishing': '1.A.4.c', # not BURDI
-        '5. Other (please specify)': '1.A.5',
-        'B. Fugitive emissions from fuels': '1.B',
-        '1. Solid fuels': '1.B.1',
-        '2. Oil and natural gas': '1.B.2',
-        '2. Industrial processes': '2',
-        'B. industrial processes': '2',
-        'From Industrial processes': '2',
-        'A. Mineral products': '2.A',
-        'CEMENT PRODUCTION': '2.A.1',
-        'PRODUCTION OF LIME': '2.A.2',
-        'SODA ASH USE': '2.A.4.b',
-        'ROAD PAVING WITH ASPHALT': '2.A.6',
-        'Container Glass': '2.A.7.a',
-        'B. Chemical industry': '2.B',
-        'NITRIC ACID PRODUCTION': '2.B.2',
-        'Ethylene': '2.B.5.b',
-        'PRODUCTION OF OTHER CHEMICALS': '2.B.5.g', #not BURDI
-        'Sulphuric Acid': '2.B.5.f', #not BURDI
-        'C. Metal production': '2.C',
-        'D. Other production': '2.D',
-        'E. Production of halocarbons and sulphur hexafluoride': '2.E',
-        'F. Consumption of halocarbons and sulphur hexafluoride': '2.F',
-        'G. Other (IPPU)': '2.G',
-        '3. Solvent and other product use': '3',
-        '4. Agriculture': '4',
-        'From Agriculture': '4',
-        'From agriculture': '4',
-        'A. Enteric fermentation': '4.A',
-        'B. Manure management': '4.B',
-        'C. Rice cultivation': '4.C',
-        'D. Agricultural soils': '4.D',
-        'E. Prescribed burning of savannahs': '4.E',
-        'F. Field burning of agricultural residues': '4.F',
-        'G. Other (Agri)': '4.G',
-        '5. Land-use change and forestry': '5',
-        'C. Land-use change and forestry': '5',
-        'A. Changes in forest and other woody biomass stocks': '5.A',
-        '2. Changes in forest and other woody biomass stocks': '5.A',
-        'B. Forest and grassland conversion': '5.B',
-        'C. Abandonment of managed lands': '5.C',
-        'D. CO2 emissions and removals from soil': '5.D',
-        '1. CO2 emissions and removals from soil': '5.D',
-        'E. Other (LULUCF)': '5.E',
+        "Total national emissions and removals": "24540",
+        "0": "24540",  # no mapping, just for completeness
+        "Total emissions and removals": "24540",
+        "Total emissions": "24540",
+        "1. Energy": "1",
+        "A. Fuel combustion (sectoral approach)": "1.A",
+        "A. From fuel combustion": "1.A",
+        "From fuel combustion": "1.A",
+        "1. Energy industries": "1.A.1",
+        "2. Manufacturing industries and construction": "1.A.2",
+        "2. Manufacturing, industries and construction": "1.A.2",
+        "3. Transport": "1.A.3",
+        "4. Other sectors": "1.A.4",
+        "4. Other": "1.A.4",
+        "Commercial, institutional residential sectors": "1.A.4.ab",  # not BURDI
+        "Commercial, institutional": "1.A.4.a",  # not BURDI
+        "residential sectors": "1.A.4.b",  # not BURDI
+        "Agriculture, forestry and fishing": "1.A.4.c",  # not BURDI
+        "5. Other (please specify)": "1.A.5",
+        "B. Fugitive emissions from fuels": "1.B",
+        "1. Solid fuels": "1.B.1",
+        "2. Oil and natural gas": "1.B.2",
+        "2. Industrial processes": "2",
+        "B. industrial processes": "2",
+        "From Industrial processes": "2",
+        "A. Mineral products": "2.A",
+        "CEMENT PRODUCTION": "2.A.1",
+        "PRODUCTION OF LIME": "2.A.2",
+        "SODA ASH USE": "2.A.4.b",
+        "ROAD PAVING WITH ASPHALT": "2.A.6",
+        "Container Glass": "2.A.7.a",
+        "B. Chemical industry": "2.B",
+        "NITRIC ACID PRODUCTION": "2.B.2",
+        "Ethylene": "2.B.5.b",
+        "PRODUCTION OF OTHER CHEMICALS": "2.B.5.g",  # not BURDI
+        "Sulphuric Acid": "2.B.5.f",  # not BURDI
+        "C. Metal production": "2.C",
+        "D. Other production": "2.D",
+        "E. Production of halocarbons and sulphur hexafluoride": "2.E",
+        "F. Consumption of halocarbons and sulphur hexafluoride": "2.F",
+        "G. Other (IPPU)": "2.G",
+        "3. Solvent and other product use": "3",
+        "4. Agriculture": "4",
+        "From Agriculture": "4",
+        "From agriculture": "4",
+        "A. Enteric fermentation": "4.A",
+        "B. Manure management": "4.B",
+        "C. Rice cultivation": "4.C",
+        "D. Agricultural soils": "4.D",
+        "E. Prescribed burning of savannahs": "4.E",
+        "F. Field burning of agricultural residues": "4.F",
+        "G. Other (Agri)": "4.G",
+        "5. Land-use change and forestry": "5",
+        "C. Land-use change and forestry": "5",
+        "A. Changes in forest and other woody biomass stocks": "5.A",
+        "2. Changes in forest and other woody biomass stocks": "5.A",
+        "B. Forest and grassland conversion": "5.B",
+        "C. Abandonment of managed lands": "5.C",
+        "D. CO2 emissions and removals from soil": "5.D",
+        "1. CO2 emissions and removals from soil": "5.D",
+        "E. Other (LULUCF)": "5.E",
         # waste in 2006 categories, not BURDI as we will lose info if we map to BURDI and back
-        '6. Waste': '6',
-        'A. Solid waste disposal on land': '6.A',
-        'From solid waste disposal on land': '6.A',
-        'B. Waste-water handling': '6X.B', # combine with 6.D
-        'From waste-water treatment': '6X.B', # not BURDI
-        'C. Waste incineration': '6.C',
-        'D. Other (please specify)': '6X.D', # combine with 6.E
-        'B. Biological Treatment of Solid Waste': '6.B', # not BURDI
-        'D.Waste-water handling': '6.D', # not BURDI
-        'D. Waste-water handling': '6.D', # not BURDI
-        'E. Other (Waste)': '6.E', # not BURDI
-        '7. Other (please specify)': '7',
-        'International bunkers': '14637',
-        'Aviation': '14424',
-        'Marine': '14423',
-        'CO2 emissions from biomass': '14638',
+        "6. Waste": "6",
+        "A. Solid waste disposal on land": "6.A",
+        "From solid waste disposal on land": "6.A",
+        "B. Waste-water handling": "6X.B",  # combine with 6.D
+        "From waste-water treatment": "6X.B",  # not BURDI
+        "C. Waste incineration": "6.C",
+        "D. Other (please specify)": "6X.D",  # combine with 6.E
+        "B. Biological Treatment of Solid Waste": "6.B",  # not BURDI
+        "D.Waste-water handling": "6.D",  # not BURDI
+        "D. Waste-water handling": "6.D",  # not BURDI
+        "E. Other (Waste)": "6.E",  # not BURDI
+        "7. Other (please specify)": "7",
+        "International bunkers": "14637",
+        "Aviation": "14424",
+        "Marine": "14423",
+        "CO2 emissions from biomass": "14638",
     },
     "entity": {
-        'Total GHG': f'KYOTOGHG ({gwp_to_use})',
-        'Carbon Dioxide (CO2)': 'CO2',
-        'CO2': 'CO2', # no mapping, just added for completeness here
-        'CO2 emissions': 'CO2 emissions', # no mapping, just added for completeness here
-        'CO2 removals': 'CO2 removals', # no mapping, just added for completeness here
-        'CO2 Emissions': 'CO2 emissions',
-        'CO2 Removals': 'CO2 removals',
-        'Methane (CH4)': 'CH4',
-        'CH4': 'CH4', # no mapping, just added for completeness here
-        'Nitrous Oxides (N2O)': 'N2O',
-        'NO2': 'NO2', # no mapping, just added for completeness here
-        'Sulfur hexafluoride (SF6)': f'SF6 ({gwp_to_use})',
-        'SF6': f'SF6 ({gwp_to_use})',
-        "Hydrofluorocarbons (HFC'S)": f'HFCS ({gwp_to_use})',
-        "HFCs": f'HFCS ({gwp_to_use})',
-        "Perfluorocarbons (PFC'S)": f'PFCS ({gwp_to_use})',
-        "PFCs": f'PFCS ({gwp_to_use})',
-        'NOx': 'NOX',
-        'Nox': 'NOX',
-        'Co': 'CO',
-        'CO': 'CO', # no mapping, just added for completeness here
-        'NMVOCs': 'NMVOC',
-        'SOx': 'SOX', # no mapping, just added for completeness here
+        "Total GHG": f"KYOTOGHG ({gwp_to_use})",
+        "Carbon Dioxide (CO2)": "CO2",
+        "CO2": "CO2",  # no mapping, just added for completeness here
+        "CO2 emissions": "CO2 emissions",  # no mapping, just added for completeness here
+        "CO2 removals": "CO2 removals",  # no mapping, just added for completeness here
+        "CO2 Emissions": "CO2 emissions",
+        "CO2 Removals": "CO2 removals",
+        "Methane (CH4)": "CH4",
+        "CH4": "CH4",  # no mapping, just added for completeness here
+        "Nitrous Oxides (N2O)": "N2O",
+        "NO2": "NO2",  # no mapping, just added for completeness here
+        "Sulfur hexafluoride (SF6)": f"SF6 ({gwp_to_use})",
+        "SF6": f"SF6 ({gwp_to_use})",
+        "Hydrofluorocarbons (HFC'S)": f"HFCS ({gwp_to_use})",
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "Perfluorocarbons (PFC'S)": f"PFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "NOx": "NOX",
+        "Nox": "NOX",
+        "Co": "CO",
+        "CO": "CO",  # no mapping, just added for completeness here
+        "NMVOCs": "NMVOC",
+        "SOx": "SOX",  # no mapping, just added for completeness here
     },
 }
 
 filter_remove = {
-    'rem_cat': {'category': ['Memo items', 'G. Other (please specify)']},
+    "rem_cat": {"category": ["Memo items", "G. Other (please specify)"]},
     #'rem_ent': {'entity': ['GHG per capita', 'GHG per GDP (2015 prices)']},
 }
 
@@ -235,76 +253,88 @@ meta_data = {
 #### for processing
 # aggregate categories
 cats_to_agg = {
-    '1': {'sources': ['1.A'], 'name': 'Energy'}, # for trends
-    '1.A.4': {'sources': ['1.A.4.a', '1.A.4.b', '1.A.4.c', '1.A.4.ab'],
-              'name': 'Other sectors'},
-    '2.A.4': {'sources': ['2.A.4.b'], 'name': 'Soda Ash'},
-    '2.A.7': {'sources': ['2.A.7.a'], 'name': 'Other'},
-    '2.A': {'sources': ['2.A.1', '2.A.2', '2.A.4', '2.A.6', '2.A.7'], 'name': 'Mineral Products'},
-    '2.B.5': {'sources': ['2.B.5.f', '2.B.5.g'], 'name': 'Other'},
-    '2.B': {'sources': ['2.B.2', '2.B.5'], 'name': 'Chemical Industry'},
-    '6.D': {'sources': ['6.D', '6X.B'], 'name': 'Wastewater Treatment and Discharge'},
+    "1": {"sources": ["1.A"], "name": "Energy"},  # for trends
+    "1.A.4": {
+        "sources": ["1.A.4.a", "1.A.4.b", "1.A.4.c", "1.A.4.ab"],
+        "name": "Other sectors",
+    },
+    "2.A.4": {"sources": ["2.A.4.b"], "name": "Soda Ash"},
+    "2.A.7": {"sources": ["2.A.7.a"], "name": "Other"},
+    "2.A": {
+        "sources": ["2.A.1", "2.A.2", "2.A.4", "2.A.6", "2.A.7"],
+        "name": "Mineral Products",
+    },
+    "2.B.5": {"sources": ["2.B.5.f", "2.B.5.g"], "name": "Other"},
+    "2.B": {"sources": ["2.B.2", "2.B.5"], "name": "Chemical Industry"},
+    "6.D": {"sources": ["6.D", "6X.B"], "name": "Wastewater Treatment and Discharge"},
     #'6.E': {'sources': ['6.E', '6X.D'], 'Other'}, # currently empty
 }
 
 # downscale
 # 1.A.4.ab
 downscaling = {
-    'sectors': {
-        '24540': {
-            'basket': '24540',
-            'basket_contents': ['2'],
-            'entities': ['SF6', 'HFCS (SARGWP100)', 'PFCS (SARGWP100)'],
-            'dim': f"category ({coords_terminologies['category']})",
+    "sectors": {
+        "24540": {
+            "basket": "24540",
+            "basket_contents": ["2"],
+            "entities": ["SF6", "HFCS (SARGWP100)", "PFCS (SARGWP100)"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '1.A': {
-            'basket': '1.A',
-            'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4'],
-            'entities': ['CO2', 'CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
-            'tolerance': 0.05, # some inconsistencies (rounding?)
+        "1.A": {
+            "basket": "1.A",
+            "basket_contents": ["1.A.1", "1.A.2", "1.A.3", "1.A.4"],
+            "entities": ["CO2", "CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
+            "tolerance": 0.05,  # some inconsistencies (rounding?)
         },
-        '1.A.4.ab': {
-            'basket': '1.A.4.ab',
-            'basket_contents': ['1.A.4.a', '1.A.4.b'],
-            'entities': ['CO2', 'CH4', 'N2O', 'SOX', 'NOX', 'CO'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "1.A.4.ab": {
+            "basket": "1.A.4.ab",
+            "basket_contents": ["1.A.4.a", "1.A.4.b"],
+            "entities": ["CO2", "CH4", "N2O", "SOX", "NOX", "CO"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '1.A.4': {
-            'basket': '1.A.4',
-            'basket_contents': ['1.A.4.a', '1.A.4.b', '1.A.4.c'],
-            'entities': ['CO2', 'CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "1.A.4": {
+            "basket": "1.A.4",
+            "basket_contents": ["1.A.4.a", "1.A.4.b", "1.A.4.c"],
+            "entities": ["CO2", "CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '2': {
-            'basket': '2',
-            'basket_contents': ['2.A', '2.B', '2.F'],
-            'entities': ['CO2', 'CH4', 'N2O', 'SF6', 'PFCS (SARGWP100)', 'HFCS (SARGWP100)'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "2": {
+            "basket": "2",
+            "basket_contents": ["2.A", "2.B", "2.F"],
+            "entities": [
+                "CO2",
+                "CH4",
+                "N2O",
+                "SF6",
+                "PFCS (SARGWP100)",
+                "HFCS (SARGWP100)",
+            ],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '2.A': {
-            'basket': '2.A',
-            'basket_contents': ['2.A.1', '2.A.2', '2.A.4', '2.A.7'],
-            'entities': ['CO2', 'CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "2.A": {
+            "basket": "2.A",
+            "basket_contents": ["2.A.1", "2.A.2", "2.A.4", "2.A.7"],
+            "entities": ["CO2", "CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '2.B': {
-            'basket': '2.B',
-            'basket_contents': ['2.B.2', '2.B.5'],
-            'entities': ['CO2', 'CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "2.B": {
+            "basket": "2.B",
+            "basket_contents": ["2.B.2", "2.B.5"],
+            "entities": ["CO2", "CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '4': {
-            'basket': '4',
-            'basket_contents': ['4.A', '4.B', '4.C', '4.D', '4.E', '4.F', '4.G'],
-            'entities': ['CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "4": {
+            "basket": "4",
+            "basket_contents": ["4.A", "4.B", "4.C", "4.D", "4.E", "4.F", "4.G"],
+            "entities": ["CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '5': {
-            'basket': '5',
-            'basket_contents': ['5.A', '5.D'], # the other sectors are 0
-            'entities': ['CO2'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "5": {
+            "basket": "5",
+            "basket_contents": ["5.A", "5.D"],  # the other sectors are 0
+            "entities": ["CO2"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
     },
 }
@@ -312,125 +342,190 @@ downscaling = {
 # map to IPCC2006
 cat_conversion = {
     # ANNEXI to come (low priority as we read from CRF files)
-    'mapping': {
-        '1': '1',
-        '1.A': '1.A',
-        '1.A.1': '1.A.1',
-        '1.A.2': '1.A.2',
-        '1.A.3': '1.A.3',
-        '1.A.4': '1.A.4',
-        '1.A.4.a': '1.A.4.a',
-        '1.A.4.b': '1.A.4.b',
-        '1.A.4.c': '1.A.4.c',
-        '1.A.5': '1.A.5', # currently not needed
-        '1.B': '1.B', # currently not needed
-        '1.B.1': '1.B.1', # currently not needed
-        '1.B.2': '1.B.2', # currently not needed
-        '2': '2',
-        '2.A': '2.A',
-        '2.A.1': '2.A.1', # cement
-        '2.A.2': '2.A.2', # lime
-        '2.A.4': '2.A.4.b', # soda ash
-        '2.A.6': '2.A.5', # road paving with asphalt -> other
-        '2.A.7.a': '2.A.3', # glass
-        '2.B': 'M.2.B_2.B',
-        '2.B.2': '2.B.2', # nitric acid
-        '2.B.5.b': '2.B.8.b', # Ethylene
-        '2.B.5.f': 'M.2.B.10.a', # sulphuric acid
-        '2.B.5.g': 'M.2.B.10.b', # other chemicals
-        '2.C': '2.C',
-        '2.D': 'M.2.H.1_2',
-        '2.E': '2.B.9',
-        '2.F': '2.F',
-        '2.G': '2.H.3',
-        '4': 'M.AG',
-        '4.A': '3.A.1',
-        '4.B': '3.A.2',
-        '4.C': '3.C.7',
-        '4.D': 'M.3.C.45.AG',
-        '4.E': '3.C.1.c',
-        '4.F': '3.C.1.b',
-        '4.G': '3.C.8',
-        '5': 'M.LULUCF',
-        '6': '4',
-        '6.A': '4.A',
-        '6.B': '4.B',
-        '6.C': '4.C',
-        '6.D': '4.D',
-        '24540': '0',
-        '15163': 'M.0.EL',
-        '14637': 'M.BK',
-        '14424': 'M.BK.A',
-        '14423': 'M.BK.M',
-        '14638': 'M.BIO',
-        '7': '5',
-    }, #5.A-D ignored as not fitting 2006 cats
-
-    'aggregate': {
-        '2.A.4': {'sources': ['2.A.4.b'], 'name': 'Other uses of soda ashes'},
-        '2.B.8': {'sources': ['2.B.8.b'], 'name': 'Petrochemical and Carbon Black production'},
-        '2.B.10': {'sources': ['M.2.B.10.a', 'M.2.B.10.b'], 'name': 'Other'},
-        '2.B': {'sources': ['2.B.2', '2.B.8', '2.B.9', '2.B.10'], 'name': 'Chemical Industry'},
-        '2.H': {'sources': ['M.2.H.1_2', '2.H.3'], 'name': 'Other'},
+    "mapping": {
+        "1": "1",
+        "1.A": "1.A",
+        "1.A.1": "1.A.1",
+        "1.A.2": "1.A.2",
+        "1.A.3": "1.A.3",
+        "1.A.4": "1.A.4",
+        "1.A.4.a": "1.A.4.a",
+        "1.A.4.b": "1.A.4.b",
+        "1.A.4.c": "1.A.4.c",
+        "1.A.5": "1.A.5",  # currently not needed
+        "1.B": "1.B",  # currently not needed
+        "1.B.1": "1.B.1",  # currently not needed
+        "1.B.2": "1.B.2",  # currently not needed
+        "2": "2",
+        "2.A": "2.A",
+        "2.A.1": "2.A.1",  # cement
+        "2.A.2": "2.A.2",  # lime
+        "2.A.4": "2.A.4.b",  # soda ash
+        "2.A.6": "2.A.5",  # road paving with asphalt -> other
+        "2.A.7.a": "2.A.3",  # glass
+        "2.B": "M.2.B_2.B",
+        "2.B.2": "2.B.2",  # nitric acid
+        "2.B.5.b": "2.B.8.b",  # Ethylene
+        "2.B.5.f": "M.2.B.10.a",  # sulphuric acid
+        "2.B.5.g": "M.2.B.10.b",  # other chemicals
+        "2.C": "2.C",
+        "2.D": "M.2.H.1_2",
+        "2.E": "2.B.9",
+        "2.F": "2.F",
+        "2.G": "2.H.3",
+        "4": "M.AG",
+        "4.A": "3.A.1",
+        "4.B": "3.A.2",
+        "4.C": "3.C.7",
+        "4.D": "M.3.C.45.AG",
+        "4.E": "3.C.1.c",
+        "4.F": "3.C.1.b",
+        "4.G": "3.C.8",
+        "5": "M.LULUCF",
+        "6": "4",
+        "6.A": "4.A",
+        "6.B": "4.B",
+        "6.C": "4.C",
+        "6.D": "4.D",
+        "24540": "0",
+        "15163": "M.0.EL",
+        "14637": "M.BK",
+        "14424": "M.BK.A",
+        "14423": "M.BK.M",
+        "14638": "M.BIO",
+        "7": "5",
+    },  # 5.A-D ignored as not fitting 2006 cats
+    "aggregate": {
+        "2.A.4": {"sources": ["2.A.4.b"], "name": "Other uses of soda ashes"},
+        "2.B.8": {
+            "sources": ["2.B.8.b"],
+            "name": "Petrochemical and Carbon Black production",
+        },
+        "2.B.10": {"sources": ["M.2.B.10.a", "M.2.B.10.b"], "name": "Other"},
+        "2.B": {
+            "sources": ["2.B.2", "2.B.8", "2.B.9", "2.B.10"],
+            "name": "Chemical Industry",
+        },
+        "2.H": {"sources": ["M.2.H.1_2", "2.H.3"], "name": "Other"},
         # '2': {'sources': ['2.A', '2.B', '2.C', '2.F', '2.H'],
         #       'name': 'Industrial Processes and Product Use'},
-        '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-        '3.C.1': {'sources': ['3.C.1.b', '3.C.1.c'],
-                     'name': 'Emissions from biomass burning'},
-        'M.3.C.1.AG': {'sources': ['3.C.1.b', '3.C.1.c'],
-                     'name': 'Emissions from biomass burning (Agriculture)'},
-        '3.C': {'sources': ['3.C.1', 'M.3.C.45.AG', '3.C.7', '3.C.8'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-        'M.3.C.AG': {'sources': ['M.3.C.1.AG', 'M.3.C.45.AG', '3.C.7', '3.C.8'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land ('
-                             'Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
-        'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'], 'name': 'National total '
-                                                                    'excluding LULUCF'},
+        "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+        "3.C.1": {
+            "sources": ["3.C.1.b", "3.C.1.c"],
+            "name": "Emissions from biomass burning",
+        },
+        "M.3.C.1.AG": {
+            "sources": ["3.C.1.b", "3.C.1.c"],
+            "name": "Emissions from biomass burning (Agriculture)",
+        },
+        "3.C": {
+            "sources": ["3.C.1", "M.3.C.45.AG", "3.C.7", "3.C.8"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land",
+        },
+        "M.3.C.AG": {
+            "sources": ["M.3.C.1.AG", "M.3.C.45.AG", "3.C.7", "3.C.8"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land ("
+            "Agriculture)",
+        },
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG"],
+            "name": "Agriculture excluding livestock",
+        },
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4", "5"],
+            "name": "National total " "excluding LULUCF",
+        },
     },
-    'basket_copy': {
-        'GWPs_to_add': ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': 'SARGWP100',
+    "basket_copy": {
+        "GWPs_to_add": ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": "SARGWP100",
     },
 }
 
 sectors_to_save = [
-    '1', '1.A', '1.A.1', '1.A.2', '1.A.3', '1.A.4', '1.A.4.a', '1.A.4.b', '1.A.4.c',
-    '1.A.5',
-    '1.B', '1.B.1', '1.B.2',
-    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4', '2.A.5',
-    '2.B', '2.B.2', '2.B.8', '2.B.9', '2.B.10', '2.C', '2.F', '2.H',
-    '3', 'M.AG', '3.A', '3.A.1', '3.A.2',
-    '3.C', '3.C.1', 'M.3.C.1.AG', '3.C.7', 'M.3.C.45.AG', '3.C.8', 'M.3.C.AG',
-    'M.LULUCF', 'M.AG.ELV',
-    '4', '4.A', '4.B', '4.C', '4.D',
-    '0', 'M.0.EL', 'M.BK', 'M.BK.A', 'M.BK.M', 'M.BIO', '5']
+    "1",
+    "1.A",
+    "1.A.1",
+    "1.A.2",
+    "1.A.3",
+    "1.A.4",
+    "1.A.4.a",
+    "1.A.4.b",
+    "1.A.4.c",
+    "1.A.5",
+    "1.B",
+    "1.B.1",
+    "1.B.2",
+    "2",
+    "2.A",
+    "2.A.1",
+    "2.A.2",
+    "2.A.3",
+    "2.A.4",
+    "2.A.5",
+    "2.B",
+    "2.B.2",
+    "2.B.8",
+    "2.B.9",
+    "2.B.10",
+    "2.C",
+    "2.F",
+    "2.H",
+    "3",
+    "M.AG",
+    "3.A",
+    "3.A.1",
+    "3.A.2",
+    "3.C",
+    "3.C.1",
+    "M.3.C.1.AG",
+    "3.C.7",
+    "M.3.C.45.AG",
+    "3.C.8",
+    "M.3.C.AG",
+    "M.LULUCF",
+    "M.AG.ELV",
+    "4",
+    "4.A",
+    "4.B",
+    "4.C",
+    "4.D",
+    "0",
+    "M.0.EL",
+    "M.BK",
+    "M.BK.A",
+    "M.BK.M",
+    "M.BIO",
+    "5",
+]
 
 
 # gas baskets
 gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)': ['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)': ['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
}
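
For illustration: a basket defined above is the sum of its member time series
once every member is expressed in the basket's GWP (the actual aggregation is
done by process_data_for_country further down). A minimal sketch with made-up
values:

    # hypothetical, already GWP-converted values in Gg CO2e
    members = {
        "HFCS (AR6GWP100)": 120.0,
        "PFCS (AR6GWP100)": 15.0,
        "SF6": 2.5,
        "NF3": 0.1,
    }
    print(f"FGASES (AR6GWP100): {sum(members.values()):.1f} Gg CO2e")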
 
 
basket_copy = {
-    'GWPs_to_add': ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
-    'entities': ["HFCS", "PFCS"],
-    'source_GWP': gwp_to_use,
+    "GWPs_to_add": ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
+    "entities": ["HFCS", "PFCS"],
+    "source_GWP": gwp_to_use,
}
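
basket_copy asks the processing step to derive the HFCS and PFCS baskets in
the listed GWPs from the SARGWP100 values reported in the BUR. A rough sketch
of the idea, assuming a basket dominated by a single gas (HFC-134a: GWP 1300
in SAR, 1430 in AR4); the real conversion happens inside
process_data_for_country:

    hfcs_sar = 650.0                   # Gg CO2e (SARGWP100), made-up value
    hfcs_ar4 = hfcs_sar / 1300 * 1430  # approximate AR4GWP100 equivalent
    print(round(hfcs_ar4, 1))          # 715.0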
 
 
+
#### functions
def is_int(input: str) -> bool:
+    """Check if a string evaluates to an integer under a defined locale"""
    try:
        locale.atoi(input)
-        return True
-    except:
+        return True  # noqa: TRY300
+    except Exception:
        return False
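
is_int() is used in the reader below to pick the year columns out of table
headers. A short sketch of its behaviour under a locale that uses ',' as
thousands separator (en_US.UTF-8 stands in for en_IL.UTF-8; which locales are
installed depends on the system):

    import locale

    locale.setlocale(locale.LC_NUMERIC, "en_US.UTF-8")
    assert locale.atoi("12,345") == 12345  # grouped numbers parse
    assert is_int("2019")                  # year column headers -> True
    assert not is_int("CO2")               # entity headers -> False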

+ 121 - 77
src/unfccc_ghg_data/unfccc_reader/Israel/read_ISR_BUR2_from_pdf.py

@@ -1,4 +1,12 @@
-# read Israel's BUR2 from pdf
+"""
+Read Israel's BUR2 from pdf
+
+This script reads data from Israel's BUR2
+Data are read from pdf using camelot
+only the 2019 inventory is read as the BUR refers to BUR2 for earlier years
+
+"""
+

# TODO: bunkers trend tables not read because of special format

@@ -9,7 +17,7 @@ import pandas as pd
import primap2 as pm2

# configuration import
-from .config_isr_bur2 import (
+from config_isr_bur2 import (
    basket_copy,
    cat_conversion,
    cats_to_agg,
@@ -29,23 +37,27 @@ from .config_isr_bur2 import (
    trend_table_def,
)

-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path, process_data_for_country
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)

if __name__ == "__main__":
    ### general configuration
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Israel' / 'BUR2'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Israel'
+    input_folder = downloaded_data_path / "UNFCCC" / "Israel" / "BUR2"
+    output_folder = extracted_data_path / "UNFCCC" / "Israel"
    if not output_folder.exists():
        output_folder.mkdir()

-    output_filename = 'ISR_BUR2_2021_'
-    inventory_file_pdf = '2nd_Biennial_Update_Report_2021_final.pdf'
-    #years_to_read = range(1990, 2018 + 1)
+    output_filename = "ISR_BUR2_2021_"
+    inventory_file_pdf = "2nd_Biennial_Update_Report_2021_final.pdf"
+    # years_to_read = range(1990, 2018 + 1)
    pages_to_read_trends = range(48, 54)
    pages_to_read_inventory = range(54, 66)

    # define locale to use for str to float conversion
-    locale_to_use = 'en_IL.UTF-8'
+    locale_to_use = "en_IL.UTF-8"
    locale.setlocale(locale.LC_NUMERIC, locale_to_use)

    compression = dict(zlib=True, complevel=9)
@@ -53,40 +65,44 @@ if __name__ == "__main__":
    #### trend tables

    # read
-    tables_trends = camelot.read_pdf(str(input_folder / inventory_file_pdf), pages=','.join(
-        [str(page) for page in pages_to_read_trends]), flavor='lattice')
+    tables_trends = camelot.read_pdf(
+        str(input_folder / inventory_file_pdf),
+        pages=",".join([str(page) for page in pages_to_read_trends]),
+        flavor="lattice",
+    )

    # convert to pm2
    table_trends = None
    for table in trend_table_def.keys():
        current_def = trend_table_def[table]
        new_table = None
-        for subtable in current_def['tables']:
+        for subtable in current_def["tables"]:
            if new_table is None:
                new_table = tables_trends[subtable].df
            else:
                new_table = pd.concat([new_table, tables_trends[subtable].df])

-        for col in new_table.columns.values:
+        for col in new_table.columns.to_numpy():
            new_table[col] = new_table[col].str.replace("\n", "")

-        new_table.iloc[0, 0] = current_def['given_col']
+        new_table.iloc[0, 0] = current_def["given_col"]
        new_table.columns = new_table.iloc[0]
        new_table = new_table.drop(labels=[0])
        new_table = new_table.reset_index(drop=True)

-        if 'take_only' in current_def.keys():
+        if "take_only" in current_def.keys():
            new_table = new_table[
-                new_table[current_def['given_col']].isin(current_def['take_only'])]
+                new_table[current_def["given_col"]].isin(current_def["take_only"])
+            ]

-        time_cols = [col for col in new_table.columns.values if is_int(col)]
+        time_cols = [col for col in new_table.columns.to_numpy() if is_int(col)]
        for col in time_cols:
            # no NE,NA etc, just numbers, so we can just remove the ','
-            new_table[col] = new_table[col].str.replace(',', '')
-            new_table[col] = new_table[col].str.replace(' ', '')
+            new_table[col] = new_table[col].str.replace(",", "")
+            new_table[col] = new_table[col].str.replace(" ", "")

-        for col in current_def['cols_add']:
-            new_table[col] = current_def['cols_add'][col]
+        for col in current_def["cols_add"]:
+            new_table[col] = current_def["cols_add"][col]

        if table_trends is None:
            table_trends = new_table
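
The stitching pattern above works because camelot returns a TableList whose
elements each expose their raw pandas DataFrame as .df. A self-contained
sketch (file name and pages are placeholders):

    import camelot
    import pandas as pd

    tables = camelot.read_pdf("report.pdf", pages="48,49", flavor="lattice")
    stitched = pd.concat([t.df for t in tables], ignore_index=True)
    print(stitched.shape)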
@@ -108,31 +124,32 @@ if __name__ == "__main__":
        # filter_keep=filter_keep,
        meta_data=meta_data,
        convert_str=True,
-        time_format='%Y'
+        time_format="%Y",
    )

-
    data_pm2_trends = pm2.pm2io.from_interchange_format(data_if_trends)
 
 
    #### inventory tables
    # read inventory tables
    tables_inv = camelot.read_pdf(
        str(input_folder / inventory_file_pdf),
-        pages=','.join([str(page) for page in pages_to_read_inventory]),
-        flavor='lattice')
+        pages=",".join([str(page) for page in pages_to_read_inventory]),
+        flavor="lattice",
+    )
 
 
    # process
    table_inv = None
    for table in inv_table_def.keys():
        new_table = None
        print(f"working on year {table}")
-        for subtable in inv_table_def[table]['tables']:
+        for subtable in inv_table_def[table]["tables"]:
            print(f"adding table {subtable}")
            if new_table is None:
                new_table = tables_inv[subtable].df
            else:
-                new_table = pd.concat([new_table, tables_inv[subtable].df], axis=0,
-                                      join='outer')
+                new_table = pd.concat(
+                    [new_table, tables_inv[subtable].df], axis=0, join="outer"
+                )
                new_table = new_table.reset_index(drop=True)
 
 
            # replace line breaks, double, and triple spaces in category names
@@ -146,75 +163,97 @@ if __name__ == "__main__":
        else:
            # replace line breaks in units and entities
            new_table.iloc[inv_tab_conf["entity_row"]] = new_table.iloc[
-                inv_tab_conf["entity_row"]].str.replace('\n', '')
+                inv_tab_conf["entity_row"]
+            ].str.replace("\n", "")

        # get_year
        year = new_table.iloc[inv_tab_conf["cat_pos"][0], inv_tab_conf["cat_pos"][1]]

        # set category col label
-        new_table.iloc[inv_tab_conf["cat_pos"][0], inv_tab_conf["cat_pos"][1]] = 'category'
+        new_table.iloc[
+            inv_tab_conf["cat_pos"][0], inv_tab_conf["cat_pos"][1]
+        ] = "category"
 
 
        new_table = pm2.pm2io.nir_add_unit_information(
            new_table,
-            unit_row=inv_tab_conf["unit_row"], entity_row=inv_tab_conf["entity_row"],
-            regexp_entity=inv_tab_conf["regex_entity"], regexp_unit=inv_tab_conf[
-                "regex_unit"],
-            default_unit="", manual_repl_unit=inv_tab_conf["unit_repl"])
+            unit_row=inv_tab_conf["unit_row"],
+            entity_row=inv_tab_conf["entity_row"],
+            regexp_entity=inv_tab_conf["regex_entity"],
+            regexp_unit=inv_tab_conf["regex_unit"],
+            default_unit="",
+            manual_repl_unit=inv_tab_conf["unit_repl"],
+        )
 
 
        # fix individual values
-        if table == '1996':
+        if table == "1996":
            loc = new_table[new_table["category"] == "NITRIC ACID PRODUCTION"].index
-            value = new_table.loc[loc, "CH4"].values
+            value = new_table.loc[loc, "CH4"].to_numpy()
            new_table.loc[loc, "N2O"] = value[0, 0]
-            new_table.loc[loc, "CH4"] = ''
-        if table == '2015':
+            new_table.loc[loc, "CH4"] = ""
+        if table == "2015":
            loc_total = new_table[
-                new_table["category"] == "Total national emissions and removals"].index
-            loc_IPPU = new_table[new_table["category"] == "2. Industrial processes"].index
-            value = new_table.loc[loc_IPPU, "PFCs"].values
+                new_table["category"] == "Total national emissions and removals"
+            ].index
+            loc_IPPU = new_table[
+                new_table["category"] == "2. Industrial processes"
+            ].index
+            value = new_table.loc[loc_IPPU, "PFCs"].to_numpy()
            new_table.loc[loc_total, "PFCs"] = value[0, 0]

        # remove lines with empty category
        new_table = new_table.drop(new_table[new_table["category"] == ""].index)

        # rename E. Other (please specify) according to row above
-        e_locs = list(new_table[new_table["category"] == "E. Other (please specify)"].index)
+        e_locs = list(
+            new_table[new_table["category"] == "E. Other (please specify)"].index
+        )
        for loc in e_locs:
            iloc = new_table.index.get_loc(loc)
-            if new_table.iloc[iloc - 1]["category"][
-                0] == "D. CO2 emissions and removals from soil":
+            if (
+                new_table.iloc[iloc - 1]["category"][0]
+                == "D. CO2 emissions and removals from soil"
+            ):
                new_table.loc[loc]["category"] = "E. Other (LULUCF)"
-            elif new_table.iloc[iloc - 1]["category"][0] in ["D.Waste-water handling",
-                                                             'D. Waste-water handling']:
+            elif new_table.iloc[iloc - 1]["category"][0] in [
+                "D.Waste-water handling",
+                "D. Waste-water handling",
+            ]:
                new_table.loc[loc]["category"] = "E. Other (Waste)"

        # rename G. Other (please specify) according to row above
-        g_locs = list(new_table[new_table["category"] == "G. Other (please specify)"].index)
+        g_locs = list(
+            new_table[new_table["category"] == "G. Other (please specify)"].index
+        )
        for loc in g_locs:
            iloc = new_table.index.get_loc(loc)
-            if new_table.iloc[iloc - 1]["category"][
-                0] == "F. Field burning of agricultural residues":
+            if (
+                new_table.iloc[iloc - 1]["category"][0]
+                == "F. Field burning of agricultural residues"
+            ):
                new_table.loc[loc]["category"] = "G. Other (Agri)"
-            elif new_table.iloc[iloc - 1]["category"][
-                0] == "F. Consumption of halocarbons and sulphur hexafluoride":
+            elif (
+                new_table.iloc[iloc - 1]["category"][0]
+                == "F. Consumption of halocarbons and sulphur hexafluoride"
+            ):
                new_table.loc[loc]["category"] = "G. Other (IPPU)"

        # set index and convert to long format
        new_table = new_table.set_index(inv_tab_conf["index_cols"])
-        new_table_long = pm2.pm2io.nir_convert_df_to_long(new_table, year,
-                                                          inv_tab_conf["header_long"])
+        new_table_long = pm2.pm2io.nir_convert_df_to_long(
+            new_table, year, inv_tab_conf["header_long"]
+        )
        # remove line breaks in values
        new_table_long["data"] = new_table_long["data"].str.replace("\n", "")

        if table_inv is None:
            table_inv = new_table_long
        else:
-            table_inv = pd.concat([table_inv, new_table_long], axis=0, join='outer')
+            table_inv = pd.concat([table_inv, new_table_long], axis=0, join="outer")
            table_inv = table_inv.reset_index(drop=True)

    # no NE,NA etc, just numbers, so we can just remove the ','
-    table_inv["data"] = table_inv["data"].str.replace(',', '')
-    table_inv["data"] = table_inv["data"].str.replace(' ', '')
+    table_inv["data"] = table_inv["data"].str.replace(",", "")
+    table_inv["data"] = table_inv["data"].str.replace(" ", "")
 
 
    # ###
    # convert to PRIMAP2 interchange format
@@ -231,14 +270,14 @@ if __name__ == "__main__":
        # filter_keep=filter_keep,
        meta_data=meta_data,
        convert_str=True,
-        time_format='%Y',
+        time_format="%Y",
    )

    data_pm2_inv = pm2.pm2io.from_interchange_format(data_if_inv)

    #### combine
    # tolerance needs to be high as rounding in trend tables leads to inconsistent data
-    data_pm2 = data_pm2_inv.pr.merge(data_pm2_trends,tolerance=0.11)
+    data_pm2 = data_pm2_inv.pr.merge(data_pm2_trends, tolerance=0.11)
    # convert back to IF to have units in the fixed format
    data_if = data_pm2.pr.to_interchange_format()
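
The tolerance argument of pr.merge() sets the relative deviation allowed
where both datasets provide a value for the same data point. With
tolerance=0.11 the following (made-up) pair would still merge:

    inv, trend = 105.0, 100.0                # same data point, two tables
    assert abs(inv - trend) / trend <= 0.11  # within 11 %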
 
 
@@ -248,40 +287,44 @@ if __name__ == "__main__":
    if not output_folder.exists():
        output_folder.mkdir()
    pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw"), data_if)
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
 
 
    encoding = {var: compression for var in data_pm2.data_vars}
    data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
-
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
 
    #### processing
    data_proc_pm2 = data_pm2

    # combine CO2 emissions and removals
    temp_CO2 = data_proc_pm2["CO2"].copy()
-    #data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].to_array()
+    # data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].to_array()
    # .pr.sum(dim="variable", skipna=True, min_count=1)
-    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
+    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum(
+        dim="entity", skipna=True, min_count=1
+    )
    data_proc_pm2["CO2"].attrs = temp_CO2.attrs
    data_proc_pm2["CO2"] = data_proc_pm2["CO2"].fillna(temp_CO2)

    # actual processing
    country_processing_step1 = {
-        'aggregate_cats': cats_to_agg,
+        "aggregate_cats": cats_to_agg,
    }
    data_proc_pm2 = process_data_for_country(
        data_proc_pm2,
-        entities_to_ignore=['CO2 emissions', 'CO2 removals'],
+        entities_to_ignore=["CO2 emissions", "CO2 removals"],
        gas_baskets={},
        processing_info_country=country_processing_step1,
    )

    country_processing_step2 = {
-        'downscale': downscaling,
-        'basket_copy': basket_copy,
+        "downscale": downscaling,
+        "basket_copy": basket_copy,
    }

    data_proc_pm2 = process_data_for_country(
@@ -289,16 +332,16 @@ if __name__ == "__main__":
        entities_to_ignore=[],
        gas_baskets=gas_baskets,
        processing_info_country=country_processing_step2,
-        cat_terminology_out = terminology_proc,
-        category_conversion = cat_conversion,
-        sectors_out = sectors_to_save,
+        cat_terminology_out=terminology_proc,
+        category_conversion=cat_conversion,
+        sectors_out=sectors_to_save,
    )

    # adapt source and metadata
    # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
    data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
 
    # ###
    # save data to IF and native format
@@ -307,9 +350,10 @@ if __name__ == "__main__":
    if not output_folder.exists():
        output_folder.mkdir()
    pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
 
    encoding = {var: compression for var in data_proc_pm2.data_vars}
    data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Malaysia/__init__.py

@@ -0,0 +1,30 @@
+"""Read Malaysia's BURs, NIRs, NCs
+
+Scripts and configurations to read Malaysia's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'MYS'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=MYS
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 922 - 602
src/unfccc_ghg_data/unfccc_reader/Malaysia/config_mys_bur3.py

@@ -1,16 +1,22 @@
+"""Config for Malaysia's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
gwp_to_use = "AR4GWP100"


cat_names_fix = {
-    '2A3 Glass Prod.': '2A3 Glass Production',
-    '2F6 Other Applications': '2F6 Other Applications (please specify)',
-    '3A2 Manure Mngmt': '3A2 Manure Mngmt.',
-    '3C7 Rice Cultivations': '3C7 Rice Cultivation',
+    "2A3 Glass Prod.": "2A3 Glass Production",
+    "2F6 Other Applications": "2F6 Other Applications (please specify)",
+    "3A2 Manure Mngmt": "3A2 Manure Mngmt.",
+    "3C7 Rice Cultivations": "3C7 Rice Cultivation",
}

values_replacement = {
-    '': '-',
-    ' ': '-',
+    "": "-",
+    " ": "-",
}

cols_for_space_stripping = ["Categories"]
@@ -18,25 +24,25 @@ cols_for_space_stripping = ["Categories"]
index_cols = ["Categories", "entity", "unit"]

# parameters part 2: conversion to interchange format
-cats_remove = ['Memo items', 'Information items']
+cats_remove = ["Memo items", "Information items"]
 
 
cat_codes_manual = {
-    'Annual change in long-term storage of carbon in HWP waste': 'M.LTS.AC.HWP',
-    'Annual change in total long-term storage of carbon stored': 'M.LTS.AC.TOT',
-    'CO2 captured': 'M.CCS',
-    'CO2 from Biomass Burning for Energy Production': 'M.BIO',
-    'For domestic storage': 'M.CCS.DOM',
-    'For storage in other countries': 'M.CCS.OCT',
-    'International Aviation (International Bunkers)': 'M.BK.A',
-    'International Bunkers': 'M.BK',
-    'International Water-borne Transport (International Bunkers)': 'M.BK.M',
-    'Long-term storage of carbon in waste disposal sites': 'M.LTS.WASTE',
-    'Multilateral Operations': 'M.MULTIOP',
-    'Other (please specify)': 'M.OTHER',
-    'Total National Emissions and Removals': '0',
+    "Annual change in long-term storage of carbon in HWP waste": "M.LTS.AC.HWP",
+    "Annual change in total long-term storage of carbon stored": "M.LTS.AC.TOT",
+    "CO2 captured": "M.CCS",
+    "CO2 from Biomass Burning for Energy Production": "M.BIO",
+    "For domestic storage": "M.CCS.DOM",
+    "For storage in other countries": "M.CCS.OCT",
+    "International Aviation (International Bunkers)": "M.BK.A",
+    "International Bunkers": "M.BK",
+    "International Water-borne Transport (International Bunkers)": "M.BK.M",
+    "Long-term storage of carbon in waste disposal sites": "M.LTS.WASTE",
+    "Multilateral Operations": "M.MULTIOP",
+    "Other (please specify)": "M.OTHER",
+    "Total National Emissions and Removals": "0",
}

-cat_code_regexp = r'(?P<code>^[A-Z0-9]{1,4})\s.*'
+cat_code_regexp = r"(?P<code>^[A-Z0-9]{1,4})\s.*"
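
cat_code_regexp extracts the leading category code from each category name;
a quick check against names from cat_names_fix above:

    import re

    cat_code_regexp = r"(?P<code>^[A-Z0-9]{1,4})\s.*"
    for name in ["2A3 Glass Production", "3C7 Rice Cultivation"]:
        print(re.match(cat_code_regexp, name).group("code"))  # 2A3, 3C7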
 
 
coords_terminologies = {
    "area": "ISO3",
@@ -48,17 +54,12 @@ coords_defaults = {
     "source": "MYS-GHG-inventory",
     "source": "MYS-GHG-inventory",
     "provenance": "measured",
     "provenance": "measured",
     "area": "MYS",
     "area": "MYS",
-    "scenario": "BUR3"
+    "scenario": "BUR3",
}
 
 
-coords_value_mapping = {
-}
+coords_value_mapping = {}
 
 
-coords_cols = {
-    "category": "Categories",
-    "entity": "entity",
-    "unit": "unit"
-}
+coords_cols = {"category": "Categories", "entity": "entity", "unit": "unit"}
 
 
add_coords_cols = {
    "orig_cat_name": ["orig_cat_name", "category"],
@@ -76,600 +77,919 @@ meta_data = {
terminology_proc = coords_terminologies["category"]

table_def_templates = {
-    '184': { #184
-        "area": ['54,498,793,100'],
-        "cols": ['150,197,250,296,346,394,444,493,540,587,637,685,738'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel Combustion', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other emissions',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A1 Cement',
-               ],
-        },
-    },
-    '185': { #184
-        "area": ['34,504,813,99'],
-        "cols": ['128,177,224,273,321,373,425,473,519,564,611,661,713,765'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A Mineral',
-                '2A1 Cement', '2A2 Lime',
-               ],
-        },
-    },
-    '186': { #also 200
-        "area": ['53,498,786,104'],
-        "cols": ['150,197,238,296,347,396,444,489,540,587,634,686,739'],
-        "rows_to_fix": {
-            3: ['2A3 Glass', '2A4 Other Process', '2A5 Other (please',
-                '2B Chemical', '2B1 Ammonia', '2B2 Nitric Acid',
-                '2B3 Adipic Acid', '2B4 Caprolactam,', '2B5 Carbide',
-                '2B6 Titanium', '2B7 Soda Ash', '2B8 Petrochemical',
-                '2B10 Other (Please', '2C1 Iron and Steel', '2C2 Ferroalloys'
-               ],
-            2: ['2B9 Fluorochemical'],
-        },
-    },
-    '187': { # also 201
-        "area": ['39,499,807,91'],
-        "cols": ['132,185,232,280,327,375,425,470,522,568,613,664,713,763'],
-        "rows_to_fix": {
-            3: ['2A3 Glass', '2A4 Other Process', '2A5 Other (please',
-                '2B Chemical', '2B1 Ammonia', '2B2 Nitric Acid',
-                '2B3 Adipic Acid', '2B5 Carbide',
-                '2B6 Titanium', '2B7 Soda Ash', '2B8 Petrochemical',
-                '2B10 Other (Please', '2C1 Iron and Steel', '2C2 Ferroalloys',
-               ],
-            2: ['2B9 Fluorochemical'],
-            5: ['2B4 Caprolactam,'],
-        },
-    },
-    '188': {
-        "area": ['48,503,802,92'],
-        "cols": ['146,194,245,295,346,400,452,500,549,596,642,695,746'],
-        "rows_to_fix": {
-            3: ['2C3 Aluminium', '2C4 Magnesium', '2C7 Other (please',
-                '2D Non-Energy', '2D2 Paraffin Wax', '2D4 Other (please',
-                '2E Electronics', '2E1 Integrated', '2E5 Other (please',
-                '2F1 Refrigeration',
-               ],
-            2: ['2E2 TFT Flat Panel', '2E4 Heat Transfer'],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '189': {
-        "area": ['41,499,806,95'],
-        "cols": ['141,184,233,282,331,376,427,472,520,567,618,665,717,760'],
-        "rows_to_fix": {
-            3: ['2C3 Aluminium', '2C4 Magnesium', '2C7 Other (please',
-                '2D Non-Energy', '2D2 Paraffin Wax', '2D4 Other (please',
-                '2E Electronics', '2E1 Integrated', '2E5 Other (please',
-                '2F1 Refrigeration',
-               ],
-            2: ['2E2 TFT Flat Panel', '2E4 Heat Transfer'],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '190': {
-        "area": ['45,500,802,125'],
-        "cols": ['146,193,243,295,349,400,453,501,549,595,644,696,748'],
-        "rows_to_fix": {
-            3: ['2F2 Foam Blowing', '2F6 Other', '2G Other Product',
-                '2G2 SF6 and PFCs', '2G4 Other (Please', '2H1 Pulp and Paper',
-                '2H2 Food and', '2H3 Other (please', '3 AGRICULTURE,',
-               ],
-            2: ['2G1 Electrical', '2G3 N2O from', '3A1 Enteric'],
-        },
-    },
-    '191': {
-        "area": ['38,498,814,120'],
-        "cols": ['130,180,229,277,326,381,429,477,526,570,620,669,717,765'],
-        "rows_to_fix": {
-            3: ['2F2 Foam Blowing', '2F6 Other', '2G Other Product',
-                '2G2 SF6 and PFCs', '2G4 Other (Please', '2H1 Pulp and Paper',
-                '2H2 Food and', '2H3 Other (please', '3 AGRICULTURE,',
-               ],
-            2: ['2G1 Electrical', '2G3 N2O from', '3A1 Enteric'],
-        },
-    },
-    '192': {
-        "area": ['39,502,807,106'],
-        "cols": ['134,193,245,296,346,400,455,507,556,602,650,701,755'],
-        "rows_to_fix": {
-            3: ['3C1 Emissions from', '3C4 Direct N2O', '3C5 Indirect N2O',
-                '3C6 Indirect N2O', '3C8 Other (please', '3D1 Harvested Wood',
-                '3D2 Other (please',
-               ],
-            5: ['3C Aggregate',],
-        },
-    },
-    '193': {
-        "area": ['36,508,815,119'],
-        "cols": ['128,179,228,278,327,379,428,476,525,571,622,670,717,766'],
-        "rows_to_fix": {
-            3: ['3C1 Emissions from', '3C4 Direct N2O', '3C5 Indirect N2O',
-                '3C6 Indirect N2O', '3C8 Other (please', '3D1 Harvested',
-                '3D2 Other (please',
-               ],
-            5: ['3C Aggregate',],
-        },
-    },
-    '194': {
-        "area": ['80,502,762,151'],
-        "cols": ['201,243,285,329,376,419,462,502,551,591,635,679,724'],
-        "rows_to_fix": {
-            3: ['4C Incineration and', '4C2 Open Burning of', '4E Other',],
-            2: ['4A1 Managed Waste', '4A2 Unmanaged Waste', '4A3 Uncategorised Waste',
-                '4B Biological Treatment', '4D Wastewater', '4D1 Domestic Wastewater',
-                '4D2 Industrial Wastewater',
-               ],
-            5: ['5A Indirect N2O'],
-        },
-    },
-    '195': {
-        "area": ['78,508,765,103'],
-        "cols": ['191,230,271,314,352,400,438,475,519,566,600,645,686,730'],
-        "rows_to_fix": {
-            3: ['4C Incineration and', '4C2 Open Burning of', '4E Other',
-                '4B Biological', '4D Wastewater', '4D1 Domestic',
-                '4D2 Industrial', '5B Other (please'
-               ],
-            2: ['4A1 Managed Waste', '4A2 Unmanaged Waste', '4A3 Uncategorised',
-                '4A Solid Waste',
-               ],
-            5: ['5A Indirect N2O'],
-        },
-    },
-    '196': {
-        "area": ['80,502,762,151'],
-        "cols": ['201,243,285,329,376,419,462,502,551,591,635,679,724'],
-        "rows_to_fix": {
-            3: ['International Aviation', 'International Water-borne',
-                'CO2 from Biomass Burning', 'For storage in other',
-                'Long-term storage of', 'Annual change in total',
-                'Annual change in long-',
-               ],
-        },
-    },
-    '197': {
-        "area": ['74,507,779,201'],
-        "cols": ['182,226,268,311,354,398,444,482,524,565,610,654,693,733'],
-        "rows_to_fix": {
-            3: ['International Aviation', 'International Water-',
-                'CO2 from Biomass', 'For storage in other',
-                'Long-term storage of', 'Annual change in total',
-                'Annual change in long-',
-               ],
-        },
-    },
-    '198': { # first CH4 table
-        "area": ['54,498,793,100'],
-        "cols": ['140,197,250,296,346,394,444,493,540,587,637,685,738'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel Combustion', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other emissions',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A1 Cement',
-               ],
-            -3: ['2A Mineral Industry'],
-        },
-    },
-    '199': {
-        "area": ['34,506,818,97'],
-        "cols": ['132,177,228,276,329,377,432,479,528,574,618,667,722,774'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A1 Cement',
-                '2A Mineral', '2A2 Lime',
-               ],
-        },
-    },
-    '202': {
-        "area": ['48,503,802,92'],
-        "cols": ['146,194,245,295,346,400,452,500,549,596,642,695,746'],
-        "rows_to_fix": {
-            3: ['2C3 Aluminium', '2C7 Other (please',
-                '2D Non-Energy', '2D2 Paraffin Wax', '2D4 Other (please',
-                '2E Electronics', '2E1 Integrated', '2E5 Other (please',
-               ],
-            2: ['2C4 Magnesium', '2E2 TFT Flat Panel', '2E4 Heat Transfer',
-                '2F1 Refrigeration',
-               ],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '203': {
-        "area": ['41,499,806,95'],
-        "cols": ['141,184,233,282,331,376,427,472,520,567,618,665,717,760'],
-        "rows_to_fix": {
-            3: ['2C3 Aluminium', '2C7 Other (please',
-                '2D Non-Energy', '2D2 Paraffin Wax', '2D4 Other (please',
-                '2E Electronics', '2E1 Integrated', '2E5 Other (please',
-               ],
-            2: ['2C4 Magnesium', '2E2 TFT Flat Panel', '2E4 Heat Transfer',
-                '2F1 Refrigeration'
-               ],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '204': {
-        "area": ['45,500,802,125'],
-        "cols": ['146,193,243,295,349,400,455,501,549,595,644,696,748'],
-        "rows_to_fix": {
-            3: ['2F6 Other', '2G Other Product',
-                '2G2 SF6 and PFCs', '2G4 Other (Please', '2H1 Pulp and Paper',
-                '2H2 Food and', '2H3 Other (please', '3 AGRICULTURE,',
-                '3A1 Enteric',
-               ],
-            2: ['2F2 Foam Blowing', '2G1 Electrical', '2G3 N2O from'],
-        },
-    },
-    '205': {
-        "area": ['38,498,814,120'],
-        "cols": ['130,180,229,277,326,381,429,477,526,570,620,669,717,765'],
-        "rows_to_fix": {
-            3: ['2F6 Other', '2G Other Product',
-                '2G2 SF6 and PFCs', '2G4 Other (Please', '2H1 Pulp and Paper',
-                '2H2 Food and', '2H3 Other (please', '3 AGRICULTURE,',
-                '3A1 Enteric',
-               ],
-            2: ['2F2 Foam Blowing', '2G1 Electrical', '2G3 N2O from'],
-        },
-    },
-    '206': { #also 220
-        "area": ['39,502,807,106'],
-        "cols": ['134,193,245,296,346,400,455,507,556,602,650,701,755'],
-        "rows_to_fix": {
-            3: ['3C1 Emissions from', '3C4 Direct N2O', '3C5 Indirect N2O',
-                '3C6 Indirect N2O', '3C8 Other (please',
-                '3D2 Other (please',
-               ],
-            2: ['3D1 Harvested Wood',],
-            5: ['3C Aggregate',],
-        },
-    },
-    '207': { # also 221
-        "area": ['36,508,815,110'],
-        "cols": ['128,179,228,278,327,379,428,476,527,571,622,670,717,766'],
-        "rows_to_fix": {
-            3: ['3C1 Emissions from', '3C4 Direct N2O', '3C5 Indirect N2O',
-                '3C6 Indirect N2O', '3C8 Other (please',
-                '3D2 Other (please',
-               ],
-            2: ['3D1 Harvested',],
-            5: ['3C Aggregate',],
-        },
-    },
-    '208': { # also 222
-        "area": ['80,502,762,151'],
-        "cols": ['201,243,285,329,376,419,462,502,551,591,635,679,724'],
-        "rows_to_fix": {
-            3: ['4C Incineration and', '4C2 Open Burning of', '4E Other',
-                '4A1 Managed Waste', '4A2 Unmanaged Waste', '4A3 Uncategorised Waste',
-                '4B Biological Treatment', '4D Wastewater', '4D1 Domestic Wastewater',
-                '4D2 Industrial Wastewater'
-               ],
-            5: ['5A Indirect N2O'],
-        },
-    },
-    '209': { # also 223
-        "area": ['78,508,765,103'],
-        "cols": ['191,230,271,314,352,400,438,475,519,560,600,645,686,730'],
-        "rows_to_fix": {
-            3: ['4C Incineration and', '4C2 Open Burning of', '4E Other',
-                '4B Biological', '4D Wastewater', '4D1 Domestic',
-                '4D2 Industrial', '5B Other (please',
-                '4A1 Managed Waste', '4A2 Unmanaged Waste', '4A3 Uncategorised',
-                '4A Solid Waste'
-               ],
-            5: ['5A Indirect N2O'],
-        },
-    },
-    '210': { # also 224
-        "area": ['80,502,762,151'],
-        "cols": ['201,243,285,329,376,419,462,502,551,591,635,679,724'],
-        "rows_to_fix": {
-            3: ['International Aviation', 'International Water-borne',
-                'Long-term storage of', 'Annual change in total',
-                'Annual change in long-',
-               ],
-            2: ['CO2 from Biomass Burning', 'For storage in other',],
-        },
-    },
-    '211': { # also 225
-        "area": ['74,507,779,201'],
-        "cols": ['182,226,268,311,354,398,444,482,524,565,610,654,693,733'],
-        "rows_to_fix": {
-            3: ['International Aviation', 'International Water-',
-                'Long-term storage of', 'Annual change in total',
-                'Annual change in long-', 'CO2 from Biomass',
-               ],
-            2: ['For storage in other',],
-        },
-    },
-    '212': {
-        "area": ['54,498,793,100'],
-        "cols": ['150,197,250,296,346,394,444,493,540,587,637,685,738'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel Combustion', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other emissions',
-                '1C Carbon Dioxide', '2 INDUSTRIAL',
-               ],
-            2: ['2A1 Cement',],
-        },
-    },
-    '213': {
-        "area": ['34,504,813,99'],
-        "cols": ['128,177,224,273,321,373,425,473,519,564,611,661,713,765'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A Mineral',
-               ],
-            2: ['2A1 Cement', '2A2 Lime',],
-        },
-    },
-    '214': {
-        "area": ['47,499,801,93'],
-        "cols": ['141,197,246,297,350,396,453,502,550,595,642,692,748'],
-        "rows_to_fix": {
-            3: ['2A5 Other (please',
-                '2B Chemical', '2B1 Ammonia', '2B2 Nitric Acid',
-                '2B3 Adipic Acid', '2B4 Caprolactam,', '2B5 Carbide',
-                '2B6 Titanium', '2B7 Soda Ash', '2B8 Petrochemical',
-                '2B10 Other (Please', '2C1 Iron and Steel', '2C2 Ferroalloys'
-               ],
-            2: ['2A3 Glass', '2A4 Other Process', '2B9 Fluorochemical'],
-            -3: ['2C Metal Industry'],
-        },
-    },
-    '215': {
-        "area": ['39,499,807,91'],
-        "cols": ['132,180,232,280,327,375,425,470,522,568,613,664,713,763'],
-        "rows_to_fix": {
-            3: ['2A5 Other (please',
-                '2B Chemical', '2B1 Ammonia', '2B2 Nitric Acid',
-                '2B3 Adipic Acid', '2B4 Caprolactam,', '2B5 Carbide',
-                '2B6 Titanium Dioxide', '2B7 Soda Ash', '2B8 Petrochemical',
-                '2B10 Other (Please', '2C1 Iron and Steel', '2C2 Ferroalloys'
-               ],
-            2: ['2A4 Other Process', '2B9 Fluorochemical'],
-            -3: ['2C Metal Industry'],
-        },
-    },
-    '216': {
-        "area": ['48,503,802,92'],
-        "cols": ['146,194,245,295,346,400,452,500,549,596,642,695,746'],
-        "rows_to_fix": {
-            3: ['2C7 Other (please', '2D Non-Energy', '2D2 Paraffin Wax',
-                '2D4 Other (please', '2E Electronics', '2E1 Integrated',
-                '2E5 Other (please',
-               ],
-            2: ['2C3 Aluminium', '2C4 Magnesium', '2E2 TFT Flat Panel',
-                '2E4 Heat Transfer', '2F1 Refrigeration',
-               ],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '217': {
-        "area": ['41,499,806,95'],
-        "cols": ['141,184,233,282,331,376,427,472,520,567,618,665,717,760'],
-        "rows_to_fix": {
-            3: ['2C7 Other (please', '2D Non-Energy', '2D2 Paraffin Wax',
-                '2D4 Other (please', '2E Electronics', '2E1 Integrated',
-                '2E5 Other (please',
-               ],
-            2: ['2C3 Aluminium', '2C4 Magnesium', '2E2 TFT Flat Panel',
-                '2E4 Heat Transfer', '2F1 Refrigeration',
-               ],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '218': {
-        "area": ['45,500,802,125'],
-        "cols": ['146,193,243,295,349,400,455,501,549,595,644,696,748'],
-        "rows_to_fix": {
-            3: ['2F6 Other', '2G Other Product', '2G2 SF6 and PFCs',
-                '2G3 N2O from', '2H3 Other (please', '3 AGRICULTURE,',
-               ],
-            2: ['2F2 Foam Blowing', '2G1 Electrical', '2G4 Other (Please',
-                '2H1 Pulp and Paper', '2H2 Food and', '3A1 Enteric',],
-        },
-    },
-    '219': {
-        "area": ['38,498,814,120'],
-        "cols": ['130,180,229,277,326,381,429,477,526,570,620,669,717,765'],
-        "rows_to_fix": {
-            3: ['2F6 Other', '2G Other Product', '2G2 SF6 and PFCs',
-                '2G3 N2O from', '2H3 Other (please', '3 AGRICULTURE,',
-               ],
-            2: ['2F2 Foam Blowing', '2G1 Electrical', '2G4 Other (Please',
-                '2H1 Pulp and Paper', '2H2 Food and', '3A1 Enteric',],
-        },
-    },
-    '226': { # also 334, 238
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
-        "rows_to_fix": {
-            2: ['2B4 Caprolactam, Glyoxal and Glyoxylic Acid'],
-        }
-    },
-    '227': { # also 331, 335, 339
-        "area": ['27,510,818,99'],
-        "cols": ['250,290,333,372,413,452,494,536,576,616,656,699,739,781'],
-        "rows_to_fix": {
-            2: ['2B4 Caprolactam, Glyoxal and Glyoxylic Acid'],
-        }
-    },
-    '228': {
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
-        "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone'],
-            2: ['2D Non-Energy Products from Fuels and Solvent'],
-        },
-    },
-    '229': {
-        "area": ['25,512,819,86'],
-        "cols": ['246,291,331,370,412,454,495,536,577,619,656,699,740,777'],
-        "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone'],
-            2: ['2D Non-Energy Products from Fuels and Solvent'],
+    "184": {  # 184
+        "area": ["54,498,793,100"],
+        "cols": ["150,197,250,296,346,394,444,493,540,587,637,685,738"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel Combustion",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other emissions",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A1 Cement",
+            ],
+        },
+    },
+    "185": {  # 184
+        "area": ["34,504,813,99"],
+        "cols": ["128,177,224,273,321,373,425,473,519,564,611,661,713,765"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A Mineral",
+                "2A1 Cement",
+                "2A2 Lime",
+            ],
+        },
+    },
+    "186": {  # also 200
+        "area": ["53,498,786,104"],
+        "cols": ["150,197,238,296,347,396,444,489,540,587,634,686,739"],
+        "rows_to_fix": {
+            3: [
+                "2A3 Glass",
+                "2A4 Other Process",
+                "2A5 Other (please",
+                "2B Chemical",
+                "2B1 Ammonia",
+                "2B2 Nitric Acid",
+                "2B3 Adipic Acid",
+                "2B4 Caprolactam,",
+                "2B5 Carbide",
+                "2B6 Titanium",
+                "2B7 Soda Ash",
+                "2B8 Petrochemical",
+                "2B10 Other (Please",
+                "2C1 Iron and Steel",
+                "2C2 Ferroalloys",
+            ],
+            2: ["2B9 Fluorochemical"],
+        },
+    },
+    "187": {  # also 201
+        "area": ["39,499,807,91"],
+        "cols": ["132,185,232,280,327,375,425,470,522,568,613,664,713,763"],
+        "rows_to_fix": {
+            3: [
+                "2A3 Glass",
+                "2A4 Other Process",
+                "2A5 Other (please",
+                "2B Chemical",
+                "2B1 Ammonia",
+                "2B2 Nitric Acid",
+                "2B3 Adipic Acid",
+                "2B5 Carbide",
+                "2B6 Titanium",
+                "2B7 Soda Ash",
+                "2B8 Petrochemical",
+                "2B10 Other (Please",
+                "2C1 Iron and Steel",
+                "2C2 Ferroalloys",
+            ],
+            2: ["2B9 Fluorochemical"],
+            5: ["2B4 Caprolactam,"],
+        },
+    },
+    "188": {
+        "area": ["48,503,802,92"],
+        "cols": ["146,194,245,295,346,400,452,500,549,596,642,695,746"],
+        "rows_to_fix": {
+            3: [
+                "2C3 Aluminium",
+                "2C4 Magnesium",
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+                "2F1 Refrigeration",
+            ],
+            2: ["2E2 TFT Flat Panel", "2E4 Heat Transfer"],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "189": {
+        "area": ["41,499,806,95"],
+        "cols": ["141,184,233,282,331,376,427,472,520,567,618,665,717,760"],
+        "rows_to_fix": {
+            3: [
+                "2C3 Aluminium",
+                "2C4 Magnesium",
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+                "2F1 Refrigeration",
+            ],
+            2: ["2E2 TFT Flat Panel", "2E4 Heat Transfer"],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "190": {
+        "area": ["45,500,802,125"],
+        "cols": ["146,193,243,295,349,400,453,501,549,595,644,696,748"],
+        "rows_to_fix": {
+            3: [
+                "2F2 Foam Blowing",
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+            ],
+            2: ["2G1 Electrical", "2G3 N2O from", "3A1 Enteric"],
+        },
+    },
+    "191": {
+        "area": ["38,498,814,120"],
+        "cols": ["130,180,229,277,326,381,429,477,526,570,620,669,717,765"],
+        "rows_to_fix": {
+            3: [
+                "2F2 Foam Blowing",
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+            ],
+            2: ["2G1 Electrical", "2G3 N2O from", "3A1 Enteric"],
+        },
+    },
+    "192": {
+        "area": ["39,502,807,106"],
+        "cols": ["134,193,245,296,346,400,455,507,556,602,650,701,755"],
+        "rows_to_fix": {
+            3: [
+                "3C1 Emissions from",
+                "3C4 Direct N2O",
+                "3C5 Indirect N2O",
+                "3C6 Indirect N2O",
+                "3C8 Other (please",
+                "3D1 Harvested Wood",
+                "3D2 Other (please",
+            ],
+            5: [
+                "3C Aggregate",
+            ],
+        },
+    },
+    "193": {
+        "area": ["36,508,815,119"],
+        "cols": ["128,179,228,278,327,379,428,476,525,571,622,670,717,766"],
+        "rows_to_fix": {
+            3: [
+                "3C1 Emissions from",
+                "3C4 Direct N2O",
+                "3C5 Indirect N2O",
+                "3C6 Indirect N2O",
+                "3C8 Other (please",
+                "3D1 Harvested",
+                "3D2 Other (please",
+            ],
+            5: [
+                "3C Aggregate",
+            ],
+        },
+    },
+    "194": {
+        "area": ["80,502,762,151"],
+        "cols": ["201,243,285,329,376,419,462,502,551,591,635,679,724"],
+        "rows_to_fix": {
+            3: [
+                "4C Incineration and",
+                "4C2 Open Burning of",
+                "4E Other",
+            ],
+            2: [
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised Waste",
+                "4B Biological Treatment",
+                "4D Wastewater",
+                "4D1 Domestic Wastewater",
+                "4D2 Industrial Wastewater",
+            ],
+            5: ["5A Indirect N2O"],
+        },
+    },
+    "195": {
+        "area": ["78,508,765,103"],
+        "cols": ["191,230,271,314,352,400,438,475,519,566,600,645,686,730"],
+        "rows_to_fix": {
+            3: [
+                "4C Incineration and",
+                "4C2 Open Burning of",
+                "4E Other",
+                "4B Biological",
+                "4D Wastewater",
+                "4D1 Domestic",
+                "4D2 Industrial",
+                "5B Other (please",
+            ],
+            2: [
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised",
+                "4A Solid Waste",
+            ],
+            5: ["5A Indirect N2O"],
+        },
+    },
+    "196": {
+        "area": ["80,502,762,151"],
+        "cols": ["201,243,285,329,376,419,462,502,551,591,635,679,724"],
+        "rows_to_fix": {
+            3: [
+                "International Aviation",
+                "International Water-borne",
+                "CO2 from Biomass Burning",
+                "For storage in other",
+                "Long-term storage of",
+                "Annual change in total",
+                "Annual change in long-",
+            ],
+        },
+    },
+    "197": {
+        "area": ["74,507,779,201"],
+        "cols": ["182,226,268,311,354,398,444,482,524,565,610,654,693,733"],
+        "rows_to_fix": {
+            3: [
+                "International Aviation",
+                "International Water-",
+                "CO2 from Biomass",
+                "For storage in other",
+                "Long-term storage of",
+                "Annual change in total",
+                "Annual change in long-",
+            ],
+        },
+    },
+    "198": {  # first CH4 table
+        "area": ["54,498,793,100"],
+        "cols": ["140,197,250,296,346,394,444,493,540,587,637,685,738"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel Combustion",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other emissions",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A1 Cement",
+            ],
+            -3: ["2A Mineral Industry"],
+        },
+    },
+    "199": {
+        "area": ["34,506,818,97"],
+        "cols": ["132,177,228,276,329,377,432,479,528,574,618,667,722,774"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A1 Cement",
+                "2A Mineral",
+                "2A2 Lime",
+            ],
+        },
+    },
+    "202": {
+        "area": ["48,503,802,92"],
+        "cols": ["146,194,245,295,346,400,452,500,549,596,642,695,746"],
+        "rows_to_fix": {
+            3: [
+                "2C3 Aluminium",
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+            ],
+            2: [
+                "2C4 Magnesium",
+                "2E2 TFT Flat Panel",
+                "2E4 Heat Transfer",
+                "2F1 Refrigeration",
+            ],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "203": {
+        "area": ["41,499,806,95"],
+        "cols": ["141,184,233,282,331,376,427,472,520,567,618,665,717,760"],
+        "rows_to_fix": {
+            3: [
+                "2C3 Aluminium",
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+            ],
+            2: [
+                "2C4 Magnesium",
+                "2E2 TFT Flat Panel",
+                "2E4 Heat Transfer",
+                "2F1 Refrigeration",
+            ],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "204": {
+        "area": ["45,500,802,125"],
+        "cols": ["146,193,243,295,349,400,455,501,549,595,644,696,748"],
+        "rows_to_fix": {
+            3: [
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+                "3A1 Enteric",
+            ],
+            2: ["2F2 Foam Blowing", "2G1 Electrical", "2G3 N2O from"],
+        },
+    },
+    "205": {
+        "area": ["38,498,814,120"],
+        "cols": ["130,180,229,277,326,381,429,477,526,570,620,669,717,765"],
+        "rows_to_fix": {
+            3: [
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+                "3A1 Enteric",
+            ],
+            2: ["2F2 Foam Blowing", "2G1 Electrical", "2G3 N2O from"],
+        },
+    },
+    "206": {  # also 220
+        "area": ["39,502,807,106"],
+        "cols": ["134,193,245,296,346,400,455,507,556,602,650,701,755"],
+        "rows_to_fix": {
+            3: [
+                "3C1 Emissions from",
+                "3C4 Direct N2O",
+                "3C5 Indirect N2O",
+                "3C6 Indirect N2O",
+                "3C8 Other (please",
+                "3D2 Other (please",
+            ],
+            2: [
+                "3D1 Harvested Wood",
+            ],
+            5: [
+                "3C Aggregate",
+            ],
+        },
+    },
+    "207": {  # also 221
+        "area": ["36,508,815,110"],
+        "cols": ["128,179,228,278,327,379,428,476,527,571,622,670,717,766"],
+        "rows_to_fix": {
+            3: [
+                "3C1 Emissions from",
+                "3C4 Direct N2O",
+                "3C5 Indirect N2O",
+                "3C6 Indirect N2O",
+                "3C8 Other (please",
+                "3D2 Other (please",
+            ],
+            2: [
+                "3D1 Harvested",
+            ],
+            5: [
+                "3C Aggregate",
+            ],
+        },
+    },
+    "208": {  # also 222
+        "area": ["80,502,762,151"],
+        "cols": ["201,243,285,329,376,419,462,502,551,591,635,679,724"],
+        "rows_to_fix": {
+            3: [
+                "4C Incineration and",
+                "4C2 Open Burning of",
+                "4E Other",
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised Waste",
+                "4B Biological Treatment",
+                "4D Wastewater",
+                "4D1 Domestic Wastewater",
+                "4D2 Industrial Wastewater",
+            ],
+            5: ["5A Indirect N2O"],
+        },
+    },
+    "209": {  # also 223
+        "area": ["78,508,765,103"],
+        "cols": ["191,230,271,314,352,400,438,475,519,560,600,645,686,730"],
+        "rows_to_fix": {
+            3: [
+                "4C Incineration and",
+                "4C2 Open Burning of",
+                "4E Other",
+                "4B Biological",
+                "4D Wastewater",
+                "4D1 Domestic",
+                "4D2 Industrial",
+                "5B Other (please",
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised",
+                "4A Solid Waste",
+            ],
+            5: ["5A Indirect N2O"],
+        },
+    },
+    "210": {  # also 224
+        "area": ["80,502,762,151"],
+        "cols": ["201,243,285,329,376,419,462,502,551,591,635,679,724"],
+        "rows_to_fix": {
+            3: [
+                "International Aviation",
+                "International Water-borne",
+                "Long-term storage of",
+                "Annual change in total",
+                "Annual change in long-",
+            ],
+            2: [
+                "CO2 from Biomass Burning",
+                "For storage in other",
+            ],
+        },
+    },
+    "211": {  # also 225
+        "area": ["74,507,779,201"],
+        "cols": ["182,226,268,311,354,398,444,482,524,565,610,654,693,733"],
+        "rows_to_fix": {
+            3: [
+                "International Aviation",
+                "International Water-",
+                "Long-term storage of",
+                "Annual change in total",
+                "Annual change in long-",
+                "CO2 from Biomass",
+            ],
+            2: [
+                "For storage in other",
+            ],
+        },
+    },
+    "212": {
+        "area": ["54,498,793,100"],
+        "cols": ["150,197,250,296,346,394,444,493,540,587,637,685,738"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel Combustion",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other emissions",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+            ],
+            2: [
+                "2A1 Cement",
+            ],
+        },
+    },
+    "213": {
+        "area": ["34,504,813,99"],
+        "cols": ["128,177,224,273,321,373,425,473,519,564,611,661,713,765"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A Mineral",
+            ],
+            2: [
+                "2A1 Cement",
+                "2A2 Lime",
+            ],
+        },
+    },
+    "214": {
+        "area": ["47,499,801,93"],
+        "cols": ["141,197,246,297,350,396,453,502,550,595,642,692,748"],
+        "rows_to_fix": {
+            3: [
+                "2A5 Other (please",
+                "2B Chemical",
+                "2B1 Ammonia",
+                "2B2 Nitric Acid",
+                "2B3 Adipic Acid",
+                "2B4 Caprolactam,",
+                "2B5 Carbide",
+                "2B6 Titanium",
+                "2B7 Soda Ash",
+                "2B8 Petrochemical",
+                "2B10 Other (Please",
+                "2C1 Iron and Steel",
+                "2C2 Ferroalloys",
+            ],
+            2: ["2A3 Glass", "2A4 Other Process", "2B9 Fluorochemical"],
+            -3: ["2C Metal Industry"],
+        },
+    },
+    "215": {
+        "area": ["39,499,807,91"],
+        "cols": ["132,180,232,280,327,375,425,470,522,568,613,664,713,763"],
+        "rows_to_fix": {
+            3: [
+                "2A5 Other (please",
+                "2B Chemical",
+                "2B1 Ammonia",
+                "2B2 Nitric Acid",
+                "2B3 Adipic Acid",
+                "2B4 Caprolactam,",
+                "2B5 Carbide",
+                "2B6 Titanium Dioxide",
+                "2B7 Soda Ash",
+                "2B8 Petrochemical",
+                "2B10 Other (Please",
+                "2C1 Iron and Steel",
+                "2C2 Ferroalloys",
+            ],
+            2: ["2A4 Other Process", "2B9 Fluorochemical"],
+            -3: ["2C Metal Industry"],
+        },
+    },
+    "216": {
+        "area": ["48,503,802,92"],
+        "cols": ["146,194,245,295,346,400,452,500,549,596,642,695,746"],
+        "rows_to_fix": {
+            3: [
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+            ],
+            2: [
+                "2C3 Aluminium",
+                "2C4 Magnesium",
+                "2E2 TFT Flat Panel",
+                "2E4 Heat Transfer",
+                "2F1 Refrigeration",
+            ],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "217": {
+        "area": ["41,499,806,95"],
+        "cols": ["141,184,233,282,331,376,427,472,520,567,618,665,717,760"],
+        "rows_to_fix": {
+            3: [
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+            ],
+            2: [
+                "2C3 Aluminium",
+                "2C4 Magnesium",
+                "2E2 TFT Flat Panel",
+                "2E4 Heat Transfer",
+                "2F1 Refrigeration",
+            ],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "218": {
+        "area": ["45,500,802,125"],
+        "cols": ["146,193,243,295,349,400,455,501,549,595,644,696,748"],
+        "rows_to_fix": {
+            3: [
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G3 N2O from",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+            ],
+            2: [
+                "2F2 Foam Blowing",
+                "2G1 Electrical",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "3A1 Enteric",
+            ],
+        },
+    },
+    "219": {
+        "area": ["38,498,814,120"],
+        "cols": ["130,180,229,277,326,381,429,477,526,570,620,669,717,765"],
+        "rows_to_fix": {
+            3: [
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G3 N2O from",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+            ],
+            2: [
+                "2F2 Foam Blowing",
+                "2G1 Electrical",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "3A1 Enteric",
+            ],
+        },
+    },
+    "226": {  # also 334, 238
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
+        "rows_to_fix": {
+            2: ["2B4 Caprolactam, Glyoxal and Glyoxylic Acid"],
+        },
+    },
+    "227": {  # also 331, 335, 339
+        "area": ["27,510,818,99"],
+        "cols": ["250,290,333,372,413,452,494,536,576,616,656,699,739,781"],
+        "rows_to_fix": {
+            2: ["2B4 Caprolactam, Glyoxal and Glyoxylic Acid"],
+        },
+    },
+    "228": {
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
+        "rows_to_fix": {
+            3: ["2F Product Uses as Substitutes for Ozone"],
+            2: ["2D Non-Energy Products from Fuels and Solvent"],
+        },
+    },
+    "229": {
+        "area": ["25,512,819,86"],
+        "cols": ["246,291,331,370,412,454,495,536,577,619,656,699,740,777"],
+        "rows_to_fix": {
+            3: ["2F Product Uses as Substitutes for Ozone"],
+            2: ["2D Non-Energy Products from Fuels and Solvent"],
+        },
+    },
+    "230": {
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
+        "rows_to_fix": {
+            -3: [
+                "Total National Emissions and Removals",
+                "2 INDUSTRIAL PROCESSES AND PRODUCT USE",
+            ],
+            2: ["2B4 Caprolactam, Glyoxal and Glyoxylic Acid"],
+        },
+    },
+    "232": {  # also 236
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
+        "rows_to_fix": {
+            -3: [
+                "2G2 SF6 and PFCs from Other Product Uses",
+            ],
+            2: [
+                "2D Non-Energy Products from Fuels and Solvent",
+                "2F Product Uses as Substitutes for Ozone",
+            ],
        },
    },
-    '230': {
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
-        "rows_to_fix": {
-            -3: ['Total National Emissions and Removals', '2 INDUSTRIAL PROCESSES AND PRODUCT USE'],
-            2: ['2B4 Caprolactam, Glyoxal and Glyoxylic Acid'],
-        }
-    },
-    '232': { # also 236
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
-        "rows_to_fix": {
-            -3: ['2G2 SF6 and PFCs from Other Product Uses',],
-            2: ['2D Non-Energy Products from Fuels and Solvent',
-                '2F Product Uses as Substitutes for Ozone',]
-        },
-    },
-    '233': {
-        "area": ['25,512,819,86'],
-        "cols": ['246,291,331,370,412,454,495,536,577,619,656,699,740,777'],
-        "rows_to_fix": {
-            -5: ['2F Product Uses as Substitutes for Ozone'],
-            2: ['2D Non-Energy Products from Fuels and Solvent'],
-            -3: ['2G Other Product Manufacture and Use',
-                 '2G2 SF6 and PFCs from Other Product Uses',]
+    "233": {
+        "area": ["25,512,819,86"],
+        "cols": ["246,291,331,370,412,454,495,536,577,619,656,699,740,777"],
+        "rows_to_fix": {
+            -5: ["2F Product Uses as Substitutes for Ozone"],
+            2: ["2D Non-Energy Products from Fuels and Solvent"],
+            -3: [
+                "2G Other Product Manufacture and Use",
+                "2G2 SF6 and PFCs from Other Product Uses",
+            ],
        },
    },
-    '237': {
-        "area": ['25,512,819,86'],
-        "cols": ['246,291,331,370,412,454,495,536,577,619,656,699,740,777'],
+    "237": {
+        "area": ["25,512,819,86"],
+        "cols": ["246,291,331,370,412,454,495,536,577,619,656,699,740,777"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['2D Non-Energy Products from Fuels and Solvent',
-                '2F Product Uses as Substitutes for Ozone'],
+            2: [
+                "2D Non-Energy Products from Fuels and Solvent",
+                "2F Product Uses as Substitutes for Ozone",
+            ],
        },
    },
-    '240': {
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
+    "240": {
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['2D Non-Energy Products from Fuels and Solvent',
-                '2F Product Uses as Substitutes for Ozone'],
-            -3: ['2E Electronics Industry',
-                 '2F1 Refrigeration and Air Conditioning',
-                 '2G2 SF6 and PFCs from Other Product Uses',],
+            2: [
+                "2D Non-Energy Products from Fuels and Solvent",
+                "2F Product Uses as Substitutes for Ozone",
+            ],
+            -3: [
+                "2E Electronics Industry",
+                "2F1 Refrigeration and Air Conditioning",
+                "2G2 SF6 and PFCs from Other Product Uses",
+            ],
        },
    },
-    '241': {
-        "area": ['25,512,819,86'],
-        "cols": ['246,291,331,370,412,454,495,536,577,619,656,699,740,777'],
+    "241": {
+        "area": ["25,512,819,86"],
+        "cols": ["246,291,331,370,412,454,495,536,577,619,656,699,740,777"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['2D Non-Energy Products from Fuels and Solvent',
-                '2F Product Uses as Substitutes for Ozone',
-                '2E1 Integrated Circuit or Semiconductor',],
-            -3: ['2F1 Refrigeration and Air Conditioning',
-                 '2G2 SF6 and PFCs from Other Product Uses',],
+            2: [
+                "2D Non-Energy Products from Fuels and Solvent",
+                "2F Product Uses as Substitutes for Ozone",
+                "2E1 Integrated Circuit or Semiconductor",
+            ],
+            -3: [
+                "2F1 Refrigeration and Air Conditioning",
+                "2G2 SF6 and PFCs from Other Product Uses",
+            ],
        },
    },
}
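
The rows_to_fix keys above give the number of extracted PDF rows that together form one logical table row, keyed off the leading text of the first fragment; negative keys presumably count the rows to merge upwards from the matched line instead of downwards. A minimal sketch of how such an entry could be applied to a camelot-style DataFrame (merge_split_rows is illustrative, not the repository's actual helper):

import pandas as pd

def merge_split_rows(
    df: pd.DataFrame, rows_to_fix: dict[int, list[str]], col: str
) -> pd.DataFrame:
    """Join category names that the PDF extraction split over several rows."""
    drop = []
    for n_rows, prefixes in rows_to_fix.items():
        if n_rows <= 1:
            continue  # negative-key (merge upwards) handling omitted in this sketch
        for idx in df.index[df[col].str.startswith(tuple(prefixes), na=False)]:
            pos = df.index.get_loc(idx)
            following = df.index[pos + 1 : pos + n_rows]
            # glue the split name fragments back onto the first row
            df.loc[idx, col] = " ".join([df.loc[idx, col], *df.loc[following, col]])
            drop.extend(following)
    return df.drop(index=drop)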

table_defs = {
-    '184': {"template": '184', "entity": "CO2", "unit": "Gg CO2 / yr"}, #CO2
-    '185': {"template": '185', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '186': {"template": '186', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '187': {"template": '187', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '188': {"template": '188', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '189': {"template": '189', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '190': {"template": '190', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '191': {"template": '191', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '192': {"template": '192', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '193': {"template": '193', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '194': {"template": '194', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '195': {"template": '195', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '196': {"template": '196', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '197': {"template": '197', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '198': {"template": '198', "entity": "CH4", "unit": "Gg CH4 / yr"}, #CH4
-    '199': {"template": '199', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '200': {"template": '186', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '201': {"template": '187', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '202': {"template": '202', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '203': {"template": '203', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '204': {"template": '204', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '205': {"template": '205', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '206': {"template": '206', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '207': {"template": '207', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '208': {"template": '208', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '209': {"template": '209', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '210': {"template": '210', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '211': {"template": '211', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '212': {"template": '212', "entity": "N2O", "unit": "Gg N2O / yr"}, #N2O
-    '213': {"template": '213', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '214': {"template": '214', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '215': {"template": '215', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '216': {"template": '216', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '217': {"template": '217', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '218': {"template": '218', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '219': {"template": '219', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '220': {"template": '206', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '221': {"template": '207', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '222': {"template": '208', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '223': {"template": '209', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '224': {"template": '210', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '225': {"template": '211', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '226': {"template": '226', "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"}, #HFCs
-    '227': {"template": '227', "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '228': {"template": '228', "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '229': {"template": '229', "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '230': {"template": '230', "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"}, #PFCs
-    '231': {"template": '227', "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '232': {"template": '232', "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '233': {"template": '233', "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '234': {"template": '226', "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"}, #SF6
-    '235': {"template": '227', "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '236': {"template": '232', "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '237': {"template": '237', "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '238': {"template": '226', "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"}, #NF3
-    '239': {"template": '227', "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '240': {"template": '240', "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '241': {"template": '241', "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "184": {"template": "184", "entity": "CO2", "unit": "Gg CO2 / yr"},  # CO2
+    "185": {"template": "185", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "186": {"template": "186", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "187": {"template": "187", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "188": {"template": "188", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "189": {"template": "189", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "190": {"template": "190", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "191": {"template": "191", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "192": {"template": "192", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "193": {"template": "193", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "194": {"template": "194", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "195": {"template": "195", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "196": {"template": "196", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "197": {"template": "197", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "198": {"template": "198", "entity": "CH4", "unit": "Gg CH4 / yr"},  # CH4
+    "199": {"template": "199", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "200": {"template": "186", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "201": {"template": "187", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "202": {"template": "202", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "203": {"template": "203", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "204": {"template": "204", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "205": {"template": "205", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "206": {"template": "206", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "207": {"template": "207", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "208": {"template": "208", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "209": {"template": "209", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "210": {"template": "210", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "211": {"template": "211", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "212": {"template": "212", "entity": "N2O", "unit": "Gg N2O / yr"},  # N2O
+    "213": {"template": "213", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "214": {"template": "214", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "215": {"template": "215", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "216": {"template": "216", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "217": {"template": "217", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "218": {"template": "218", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "219": {"template": "219", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "220": {"template": "206", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "221": {"template": "207", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "222": {"template": "208", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "223": {"template": "209", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "224": {"template": "210", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "225": {"template": "211", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "226": {
+        "template": "226",
+        "entity": "HFCS (AR4GWP100)",
+        "unit": "Gg CO2 / yr",
+    },  # HFCs
+    "227": {"template": "227", "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "228": {"template": "228", "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "229": {"template": "229", "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "230": {
+        "template": "230",
+        "entity": "PFCS (AR4GWP100)",
+        "unit": "Gg CO2 / yr",
+    },  # PFCs
+    "231": {"template": "227", "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "232": {"template": "232", "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "233": {"template": "233", "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "234": {
+        "template": "226",
+        "entity": "SF6 (AR4GWP100)",
+        "unit": "Gg CO2 / yr",
+    },  # SF6
+    "235": {"template": "227", "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "236": {"template": "232", "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "237": {"template": "237", "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "238": {
+        "template": "226",
+        "entity": "NF3 (AR4GWP100)",
+        "unit": "Gg CO2 / yr",
+    },  # NF3
+    "239": {"template": "227", "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "240": {"template": "240", "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "241": {"template": "241", "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
}
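
The table_defs above deliberately reuses layout templates across gases: page "200" is read with the page-186 layout but as CH4, and pages "220" to "225" reuse the CH4 layouts for N2O. A hedged sketch of how the two dicts could be combined (resolve_table_def is illustrative, not the reader's actual helper):

def resolve_table_def(page: str) -> dict:
    """Merge a page's layout template with its page-specific metadata."""
    meta = table_defs[page]
    resolved = dict(table_def_templates[meta["template"]])  # area, cols, rows_to_fix
    resolved.update(entity=meta["entity"], unit=meta["unit"])
    return resolved

# resolve_table_def("220") -> the page-206 layout, labelled N2O in Gg N2O / yr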

country_processing_step1 = {
-    'aggregate_cats': {
-        'M.3.C.AG': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5',
-                                 '3.C.6', '3.C.7', '3.C.8'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land '
-                             '(Agriculture)'},
-        'M.3.D.AG': {'sources': ['3.D.2'],
-                     'name': 'Other (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG', 'M.3.D.AG'],
-                     'name': 'Agriculture excluding livestock'},
-        'M.AG': {'sources': ['3.A', 'M.AG.ELV'],
-                     'name': 'Agriculture'},
-        'M.3.D.LU': {'sources': ['3.D.1'],
-                     'name': 'Other (LULUCF)'},
-        'M.LULUCF': {'sources': ['3.B', 'M.3.D.LU'],
-                     'name': 'LULUCF'},
-        'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'],
-                     'name': 'National total emissions excluding LULUCF'},
-    },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "aggregate_cats": {
+        "M.3.C.AG": {
+            "sources": [
+                "3.C.1",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+                "3.C.8",
+            ],
+            "name": "Aggregate sources and non-CO2 emissions sources on land "
+            "(Agriculture)",
+        },
+        "M.3.D.AG": {"sources": ["3.D.2"], "name": "Other (Agriculture)"},
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG", "M.3.D.AG"],
+            "name": "Agriculture excluding livestock",
+        },
+        "M.AG": {"sources": ["3.A", "M.AG.ELV"], "name": "Agriculture"},
+        "M.3.D.LU": {"sources": ["3.D.1"], "name": "Other (LULUCF)"},
+        "M.LULUCF": {"sources": ["3.B", "M.3.D.LU"], "name": "LULUCF"},
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4", "5"],
+            "name": "National total emissions excluding LULUCF",
+        },
+    },
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
    },
}
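
The aggregate_cats mapping above is order-sensitive: M.AG.ELV sums the two M.3.*.AG aggregates defined before it, and M.AG and M.0.EL build on M.AG.ELV in turn. A minimal sketch of the implied pass, assuming plain per-category totals (the real processing operates on primap2 datasets):

def aggregate(values: dict[str, float], aggregate_cats: dict) -> dict[str, float]:
    """Sum child categories into new aggregate codes, in insertion order."""
    out = dict(values)
    for code, spec in aggregate_cats.items():
        # later aggregates (M.AG, M.0.EL) may reference earlier results
        out[code] = sum(out.get(src, 0.0) for src in spec["sources"])
    return out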

gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
}
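
Baskets can reference other baskets (each KYOTOGHG entry pulls in the matching FGASES basket), so membership resolves recursively; entities carrying a GWP suffix such as "(AR4GWP100)" are already CO2-equivalents under that GWP set. Illustrative only:

def basket_members(name: str) -> list[str]:
    """Recursively expand a basket into its primary entities."""
    members: list[str] = []
    for entity in gas_baskets.get(name, [name]):
        if entity in gas_baskets:
            members.extend(basket_members(entity))
        else:
            members.append(entity)
    return members

# basket_members("KYOTOGHG (AR4GWP100)")
# -> ["CO2", "CH4", "N2O", "HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"]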

+ 258 - 253
src/unfccc_ghg_data/unfccc_reader/Malaysia/config_mys_bur4.py

@@ -1,3 +1,9 @@
+"""Config for Malaysia's BUR4
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
gwp_to_use = "AR4GWP100"


@@ -9,8 +15,8 @@ cat_names_fix = {
}

values_replacement = {
-    '': '-',
-    ' ': '-',
+    "": "-",
+    " ": "-",
}

cols_for_space_stripping = ["Categories"]
@@ -18,25 +24,25 @@ cols_for_space_stripping = ["Categories"]
index_cols = ["Categories", "entity", "unit"]

# parameters part 2: conversion to interchange format
-cats_remove = ['Memo items', 'Information items',  'Information items (1)']
+cats_remove = ["Memo items", "Information items", "Information items (1)"]

cat_codes_manual = {
-    'Annual change in long-term storage of carbon in HWP waste': 'M.LTS.AC.HWP',
-    'Annual change in total long-term storage of carbon stored': 'M.LTS.AC.TOT',
-    'CO2 captured': 'M.CCS',
-    'CO2 from Biomass Burning for Energy Production': 'M.BIO',
-    'For domestic storage': 'M.CCS.DOM',
-    'For storage in other countries': 'M.CCS.OCT',
-    'International Aviation (International Bunkers)': 'M.BK.A',
-    'International Bunkers': 'M.BK',
-    'International Water-borne Transport (International Bunkers)': 'M.BK.M',
-    'Long-term storage of carbon in waste disposal sites': 'M.LTS.WASTE',
-    'Multilateral Operations': 'M.MULTIOP',
-    'Other (please specify)': 'M.OTHER',
-    'Total National Emissions and Removals': '0',
+    "Annual change in long-term storage of carbon in HWP waste": "M.LTS.AC.HWP",
+    "Annual change in total long-term storage of carbon stored": "M.LTS.AC.TOT",
+    "CO2 captured": "M.CCS",
+    "CO2 from Biomass Burning for Energy Production": "M.BIO",
+    "For domestic storage": "M.CCS.DOM",
+    "For storage in other countries": "M.CCS.OCT",
+    "International Aviation (International Bunkers)": "M.BK.A",
+    "International Bunkers": "M.BK",
+    "International Water-borne Transport (International Bunkers)": "M.BK.M",
+    "Long-term storage of carbon in waste disposal sites": "M.LTS.WASTE",
+    "Multilateral Operations": "M.MULTIOP",
+    "Other (please specify)": "M.OTHER",
+    "Total National Emissions and Removals": "0",
}

-cat_code_regexp = r'(?P<code>^[A-Z0-9]{1,4})\s.*'
+cat_code_regexp = r"(?P<code>^[A-Z0-9]{1,4})\s.*"
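
For rows not caught by cat_codes_manual, this pattern pulls the category code off the front of the name, e.g.:

import re

m = re.match(cat_code_regexp, "2B4 Caprolactam, Glyoxal and Glyoxylic Acid")
assert m is not None and m.group("code") == "2B4"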


coords_terminologies = {
@@ -49,27 +55,22 @@ coords_defaults = {
     "source": "MYS-GHG-inventory",
     "source": "MYS-GHG-inventory",
     "provenance": "measured",
     "provenance": "measured",
     "area": "MYS",
     "area": "MYS",
-    "scenario": "BUR4"
+    "scenario": "BUR4",
}

-coords_value_mapping = {
-}
+coords_value_mapping = {}
-coords_cols = {
-    "category": "Categories",
-    "entity": "entity",
-    "unit": "unit"
-}
+coords_cols = {"category": "Categories", "entity": "entity", "unit": "unit"}

add_coords_cols = {
    "orig_cat_name": ["orig_cat_name", "category"],
}

-#filter_remove = {
+# filter_remove = {
 #    "f1": {
 #    "f1": {
 #        "entity": ["CO2(grossemissions)", "CO2(removals)"],
 #        "entity": ["CO2(grossemissions)", "CO2(removals)"],
 #    },
 #    },
-#}
+# }
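
The coords_* and filter dicts in this config are presumably handed to primap2's interchange-format conversion; a hedged sketch of the call (kwargs as in primap2.pm2io.convert_wide_dataframe_if; the reader script may differ, e.g. passing filter_remove once it is re-enabled):

import primap2 as pm2

# df_all: the wide DataFrame assembled from the PDF tables (assumed name)
data_if = pm2.pm2io.convert_wide_dataframe_if(
    df_all,
    coords_cols=coords_cols,
    add_coords_cols=add_coords_cols,
    coords_defaults=coords_defaults,
    coords_terminologies=coords_terminologies,
    coords_value_mapping=coords_value_mapping,
    meta_data=meta_data,
)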
meta_data = {
    "references": "https://unfccc.int/documents/624776",
@@ -84,318 +85,322 @@ terminology_proc = coords_terminologies["category"]

table_def_templates = {
    # CO2
-    '203': {  # 203, 249
-        "area": ['70,480,768,169'],
+    "203": {  # 203, 249
+        "area": ["70,480,768,169"],
    },
-    '204': {  # 204
-        "area": ['70,500,763,141'],
+    "204": {  # 204
+        "area": ["70,500,763,141"],
    },
-    '205': {  # 205, 209, 214, 218
-        "area": ['70,495,763,95'],
+    "205": {  # 205, 209, 2014, 2018
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['5A Indirect N2O emissions from the atmospheric deposition of'],
+            2: ["5A Indirect N2O emissions from the atmospheric deposition of"],
        },
    },
-    '206': {  # 206
-        "area": ['70,495,763,353'],
+    "206": {  # 206
+        "area": ["70,495,763,353"],
    },
-    '207': {  # 207, 208, 211, 212, 213, 215, 217, 223, 227, 231,
+    "207": {  # 207, 208, 211, 212, 213, 215, 217, 223, 227, 231,
        # 251, 257, 259, 263, 265
-        "area": ['70,495,763,95'],
+        "area": ["70,495,763,95"],
    },
-    '216': {  #  216
-        "area": ['70,500,763,95'],
+    "216": {  #  216
+        "area": ["70,500,763,95"],
    },
    # CH4
-    '219': {  # 219, 255
-        "area": ['70,480,768,100'],
+    "219": {  # 219, 255
+        "area": ["70,480,768,100"],
    },
-    '220': {  # 220, 224, 228
-        "area": ['70,495,763,95'],
+    "220": {  # 220, 224, 228
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
-    '221': {  # 221
-        "area": ['92,508,748,92'],
-        "cols": ['298,340,380,422,462,502,542,582,622,662,702'],
+    "221": {  # 221
+        "area": ["92,508,748,92"],
+        "cols": ["298,340,380,422,462,502,542,582,622,662,702"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['3C Aggregate sources and Non-CO2 emissions'],
-            2: ['5A Indirect N2O emissions from the atmospheric'],
+            3: ["3C Aggregate sources and Non-CO2 emissions"],
+            2: ["5A Indirect N2O emissions from the atmospheric"],
        },
    },
-    '222': {  # 222
-        "area": ['70,495,763,323'],
+    "222": {  # 222
+        "area": ["70,495,763,323"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['Annual change in long-term storage of carbon in HWP'],
+            2: ["Annual change in long-term storage of carbon in HWP"],
        },
    },
-    '225': {  # 225
-        "area": ['92,508,748,92'],
-        "cols": ['311,357,400,443,486,529,572,615,658,701'],
+    "225": {  # 225
+        "area": ["92,508,748,92"],
+        "cols": ["311,357,400,443,486,529,572,615,658,701"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['3C Aggregate sources and Non-CO2 emissions'],
+            3: ["3C Aggregate sources and Non-CO2 emissions"],
        },
    },
-    '226': {  # 226, 230
-        "area": ['70,495,763,95'],
+    "226": {  # 226, 230
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['5A Indirect N2O emissions from the atmospheric',
-                'Annual change in long-term storage of carbon in HWP'],
+            2: [
+                "5A Indirect N2O emissions from the atmospheric",
+                "Annual change in long-term storage of carbon in HWP",
+            ],
        },
    },
-    '229': {  # 229
-        "area": ['114,508,725,92'],
-        "cols": ['333,379,421,464,506,548,590,632,674'],
+    "229": {  # 229
+        "area": ["114,508,725,92"],
+        "cols": ["333,379,421,464,506,548,590,632,674"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['3C Aggregate sources and Non-CO2 emissions'],
+            3: ["3C Aggregate sources and Non-CO2 emissions"],
        },
    },
    # N2O
-    '232': {  # 232
-        "area": ['70,495,763,95'],
-        "cols": ['315,366,416,466,516,566,616,666,716'],
+    "232": {  # 232
+        "area": ["70,495,763,95"],
+        "cols": ["315,366,416,466,516,566,616,666,716"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
-    '233': {  # 233
-        "area": ['70,495,763,95'],
+    "233": {  # 233
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['3C Aggregate sources and Non-CO2 emissions'],
+            3: ["3C Aggregate sources and Non-CO2 emissions"],
        },
    },
-    '234': {  # 234
-        "area": ['70,495,763,95'],
+    "234": {  # 234
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['International Water-borne Transport (International'],
+            3: ["International Water-borne Transport (International"],
        },
    },
-    '236': {  # 236
-        "area": ['70,495,763,95'],
-        "cols": ['298,344,392,439,487,534,580,629,675,721'],
+    "236": {  # 236
+        "area": ["70,495,763,95"],
+        "cols": ["298,344,392,439,487,534,580,629,675,721"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
-    '240': {  # 240
-        "area": ['70,495,763,95'],
-        "cols": ['283,329,372,416,459,504,550,594,639,682,726'],
+    "240": {  # 240
+        "area": ["70,495,763,95"],
+        "cols": ["283,329,372,416,459,504,550,594,639,682,726"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
    # HFCs
-    '243': {  # 243
-        "area": ['70,480,763,95'],
-        "cols": ['408,449,489,527,567,604,644,681,721'],
+    "243": {  # 243
+        "area": ["70,480,763,95"],
+        "cols": ["408,449,489,527,567,604,644,681,721"],
    },
-    '244': {  # 244
-        "area": ['70,495,763,95'],
-        "cols": ['408,449,489,527,567,604,644,681,721'],
+    "244": {  # 244
+        "area": ["70,495,763,95"],
+        "cols": ["408,449,489,527,567,604,644,681,721"],
    },
-    '245': {  # 245, 246
-        "area": ['70,495,763,95'],
-        "cols": ['405,442,478,515,550,587,621,657,693,729'],
+    "245": {  # 245, 246
+        "area": ["70,495,763,95"],
+        "cols": ["405,442,478,515,550,587,621,657,693,729"],
    },
-    '247': {  # 247, 248
-        "area": ['70,495,763,95'],
-        "cols": ['384,426,459,493,531,564,597,633,666,700,735'],
+    "247": {  # 247, 248
+        "area": ["70,495,763,95"],
+        "cols": ["384,426,459,493,531,564,597,633,666,700,735"],
    },
    # PFCs
-    '250': {  # 250
-        "area": ['70,495,763,95'],
-        "cols": ['341,389,436,485,531,579,626,674,723'],
+    "250": {  # 250
+        "area": ["70,495,763,95"],
+        "cols": ["341,389,436,485,531,579,626,674,723"],
    },
-    '252': {  # 252
-        "area": ['70,495,763,95'],
-        "cols": ['323,370,415,459,504,547,590,636,680,726'],
+    "252": {  # 252
+        "area": ["70,495,763,95"],
+        "cols": ["323,370,415,459,504,547,590,636,680,726"],
    },
-    '253': {  # 253
-        "area": ['70,495,763,95'],
-        "cols": ['334,378,419,464,511,554,597,636,668,702,735'],
+    "253": {  # 253
+        "area": ["70,495,763,95"],
+        "cols": ["334,378,419,464,511,554,597,636,668,702,735"],
    },
-    '254': {  # 254
-        "area": ['70,495,763,95'],
-        "cols": ['330,378,419,464,511,554,597,636,668,702,735'],
+    "254": {  # 254
+        "area": ["70,495,763,95"],
+        "cols": ["330,378,419,464,511,554,597,636,668,702,735"],
         "rows_to_fix": {
         "rows_to_fix": {
-            -3: ['2F Product Uses as Substitutes for Ozone Depleting Substances'],
+            -3: ["2F Product Uses as Substitutes for Ozone Depleting Substances"],
        },
    },
    # SF6
-    '256': {  # 256
-        "area": ['70,495,763,95'],
-        "cols": ['382,420,462,504,546,588,630,672,714'],
+    "256": {  # 256
+        "area": ["70,495,763,95"],
+        "cols": ["382,420,462,504,546,588,630,672,714"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
-    '258': {  # 258
-        "area": ['70,495,763,95'],
-        "cols": ['363,399,441,481,522,564,606,646,688,728'],
+    "258": {  # 258
+        "area": ["70,495,763,95"],
+        "cols": ["363,399,441,481,522,564,606,646,688,728"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
-    '260': {  # 260
-        "area": ['70,495,763,95'],
-        "cols": ['346,381,419,458,498,536,576,614,652,692,732'],
+    "260": {  # 260
+        "area": ["70,495,763,95"],
+        "cols": ["346,381,419,458,498,536,576,614,652,692,732"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
    # NF3
-    '261': {  # 261
-        "area": ['70,490,768,100'],
-        "cols": ['364,412,454,496,538,581,623,667,710'],
+    "261": {  # 261
+        "area": ["70,490,768,100"],
+        "cols": ["364,412,454,496,538,581,623,667,710"],
    },
-    '262': {  # 262
-        "area": ['70,495,763,95'],
-        "cols": ['376,420,462,504,545,591,633,676,718'],
+    "262": {  # 262
+        "area": ["70,495,763,95"],
+        "cols": ["376,420,462,504,545,591,633,676,718"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
-    '264': {  # 264
-        "area": ['70,495,763,95'],
-        "cols": ['370,415,451,491,530,569,609,651,689,729'],
+    "264": {  # 264
+        "area": ["70,495,763,95"],
+        "cols": ["370,415,451,491,530,569,609,651,689,729"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
-    '266': {  # 266
-        "area": ['70,495,763,95'],
-        "cols": ['355,392,430,467,505,544,580,619,656,695,732'],
+    "266": {  # 266
+        "area": ["70,495,763,95"],
+        "cols": ["355,392,430,467,505,544,580,619,656,695,732"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
        },
    },
}

table_defs = {
-    '203': {"template": '203', "entity": "CO2", "unit": "Gg CO2 / yr"},  # CO2
-    '204': {"template": '204', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '205': {"template": '205', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '206': {"template": '206', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '207': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '208': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '209': {"template": '205', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '210': {"template": '206', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '211': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '212': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '213': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '214': {"template": '205', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '215': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '216': {"template": '216', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '217': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '218': {"template": '205', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '219': {"template": '219', "entity": "CH4", "unit": "Gg CH4 / yr"},  # CH4
-    '220': {"template": '220', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '221': {"template": '221', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '222': {"template": '222', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '223': {"template": '207', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '224': {"template": '220', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '225': {"template": '225', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '226': {"template": '226', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '227': {"template": '207', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '228': {"template": '220', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '229': {"template": '229', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '230': {"template": '226', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '231': {"template": '207', "entity": "N2O", "unit": "Gg N2O / yr"},  # N2O
-    '232': {"template": '232', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '233': {"template": '233', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '234': {"template": '234', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '235': {"template": '207', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '236': {"template": '236', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '237': {"template": '233', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '238': {"template": '234', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '239': {"template": '207', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '240': {"template": '240', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '241': {"template": '233', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '242': {"template": '234', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '243': {"template": '243', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},  # HFCs
-    '244': {"template": '244', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '245': {"template": '245', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '246': {"template": '245', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '247': {"template": '247', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '248': {"template": '247', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '249': {"template": '203', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},  # PFCs
-    '250': {"template": '250', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '251': {"template": '207', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '252': {"template": '252', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '253': {"template": '253', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '254': {"template": '254', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '255': {"template": '219', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},  # SF6
-    '256': {"template": '256', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '257': {"template": '207', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '258': {"template": '258', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '259': {"template": '207', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '260': {"template": '260', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '261': {"template": '261', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},  # NF3
-    '262': {"template": '262', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '263': {"template": '207', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '264': {"template": '264', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '265': {"template": '207', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '266': {"template": '266', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
+    "203": {"template": "203", "entity": "CO2", "unit": "Gg CO2 / yr"},  # CO2
+    "204": {"template": "204", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "205": {"template": "205", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "206": {"template": "206", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "207": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "208": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "209": {"template": "205", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "210": {"template": "206", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "211": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "212": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "213": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "214": {"template": "205", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "215": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "216": {"template": "216", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "217": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "218": {"template": "205", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "219": {"template": "219", "entity": "CH4", "unit": "Gg CH4 / yr"},  # CH4
+    "220": {"template": "220", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "221": {"template": "221", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "222": {"template": "222", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "223": {"template": "207", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "224": {"template": "220", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "225": {"template": "225", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "226": {"template": "226", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "227": {"template": "207", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "228": {"template": "220", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "229": {"template": "229", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "230": {"template": "226", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "231": {"template": "207", "entity": "N2O", "unit": "Gg N2O / yr"},  # N2O
+    "232": {"template": "232", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "233": {"template": "233", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "234": {"template": "234", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "235": {"template": "207", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "236": {"template": "236", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "237": {"template": "233", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "238": {"template": "234", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "239": {"template": "207", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "240": {"template": "240", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "241": {"template": "233", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "242": {"template": "234", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "243": {
+        "template": "243",
+        "entity": f"HFCS ({gwp_to_use})",
+        "unit": "Gg CO2 / yr",
+    },  # HFCs
+    "244": {"template": "244", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "245": {"template": "245", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "246": {"template": "245", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "247": {"template": "247", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "248": {"template": "247", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "249": {
+        "template": "203",
+        "entity": f"PFCS ({gwp_to_use})",
+        "unit": "Gg CO2 / yr",
+    },  # PFCs
+    "250": {"template": "250", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "251": {"template": "207", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "252": {"template": "252", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "253": {"template": "253", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "254": {"template": "254", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "255": {
+        "template": "219",
+        "entity": f"SF6 ({gwp_to_use})",
+        "unit": "Gg CO2 / yr",
+    },  # SF6
+    "256": {"template": "256", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "257": {"template": "207", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "258": {"template": "258", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "259": {"template": "207", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "260": {"template": "260", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "261": {
+        "template": "261",
+        "entity": f"NF3 ({gwp_to_use})",
+        "unit": "Gg CO2 / yr",
+    },  # NF3
+    "262": {"template": "262", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "263": {"template": "207", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "264": {"template": "264", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "265": {"template": "207", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "266": {"template": "266", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
 }

 country_processing_step1 = {
-    'aggregate_cats': {
-        'M.3.C.AG': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5',
-                                 '3.C.6', '3.C.7', '3.C.8'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land '
-                             '(Agriculture)'},
-        'M.3.D.AG': {'sources': ['3.D.2'],
-                     'name': 'Other (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG', 'M.3.D.AG'],
-                     'name': 'Agriculture excluding livestock'},
-        'M.AG': {'sources': ['3.A', 'M.AG.ELV'],
-                     'name': 'Agriculture'},
-        'M.3.D.LU': {'sources': ['3.D.1'],
-                     'name': 'Other (LULUCF)'},
-        'M.LULUCF': {'sources': ['3.B', 'M.3.D.LU'],
-                     'name': 'LULUCF'},
-        'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'],
-                     'name': 'National total emissions excluding LULUCF'},
-    },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "aggregate_cats": {
+        "M.3.C.AG": {
+            "sources": [
+                "3.C.1",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+                "3.C.8",
+            ],
+            "name": "Aggregate sources and non-CO2 emissions sources on land "
+            "(Agriculture)",
+        },
+        "M.3.D.AG": {"sources": ["3.D.2"], "name": "Other (Agriculture)"},
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG", "M.3.D.AG"],
+            "name": "Agriculture excluding livestock",
+        },
+        "M.AG": {"sources": ["3.A", "M.AG.ELV"], "name": "Agriculture"},
+        "M.3.D.LU": {"sources": ["3.D.1"], "name": "Other (LULUCF)"},
+        "M.LULUCF": {"sources": ["3.B", "M.3.D.LU"], "name": "LULUCF"},
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4", "5"],
+            "name": "National total emissions excluding LULUCF",
+        },
+    },
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
     },
 }

 gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
 }

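The `aggregate_cats` rules above are consumed during processing: each new code (e.g. M.AG.ELV) is the sum of its listed source categories, and later rules may use categories created by earlier ones. A minimal pandas sketch of that summation, illustrative only (the actual processing in unfccc_ghg_data/primap2 additionally handles units, tolerances and the GWP basket copies):

import pandas as pd

# two of the rules from country_processing_step1["aggregate_cats"] above
aggregate_cats = {
    "M.3.D.AG": {"sources": ["3.D.2"], "name": "Other (Agriculture)"},
    "M.AG.ELV": {
        "sources": ["M.3.C.AG", "M.3.D.AG"],
        "name": "Agriculture excluding livestock",
    },
}

# wide table: one row per category, one column per year
data = pd.DataFrame(
    {"2019": [1.0, 2.0]},
    index=pd.Index(["3.D.2", "M.3.C.AG"], name="category"),
)

# rule order matters: M.AG.ELV uses M.3.D.AG, which is created first
for new_cat, spec in aggregate_cats.items():
    sources = [cat for cat in spec["sources"] if cat in data.index]
    data.loc[new_cat] = data.loc[sources].sum()

print(data)  # M.3.D.AG = 1.0, M.AG.ELV = 3.0
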
+ 82 - 52
src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR3_from_pdf.py

@@ -1,8 +1,15 @@
-# this script reads data from Malaysia's BUR3
+"""
+Read Malaysia's BUR3 from pdf
+
+This script reads data from Malaysia's BUR3
+Data are read from pdf using camelot
+
+"""
+

 import camelot
 import primap2 as pm2
-from .config_mys_bur3 import (
+from config_mys_bur3 import (
     cat_code_regexp,
     cat_codes_manual,
     cat_names_fix,
@@ -33,8 +40,8 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Malaysia' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Malaysia'
+    input_folder = downloaded_data_path / "UNFCCC" / "Malaysia" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Malaysia"
     if not output_folder.exists():
         output_folder.mkdir()

@@ -47,7 +54,7 @@ if __name__ == "__main__":
     # SF6: 234 - 237
     # NF3: 238 - 241

-    output_filename = 'MYS_BUR3_2020_'
+    output_filename = "MYS_BUR3_2020_"
     compression = dict(zlib=True, complevel=9)

     # ###
@@ -62,36 +69,44 @@ if __name__ == "__main__":
         area = table_def_templates[page_template_nr]["area"]
         if "cols" in table_def_templates[page_template_nr].keys():
             cols = table_def_templates[page_template_nr]["cols"]
-            tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page), \
-                                      flavor='stream', table_areas=area, columns=cols,
-                                      split_text=True)
+            tables = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+                columns=cols,
+                split_text=True,
+            )
         else:
-            tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page), \
-                                      flavor='stream', table_areas=area)
+            tables = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+            )

         df_current = tables[0].df.copy()
-        df_current.iloc[0,0] = 'Categories'
+        df_current.iloc[0, 0] = "Categories"
         df_current.columns = df_current.iloc[0]
         df_current = df_current.drop(0)
         # replace double \n
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("\n", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
         # replace double and triple spaces
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("   ", " ")
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("  ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")

         # fix the split rows
         if "rows_to_fix" in table_def_templates[page_template_nr].keys():
             for n_rows in table_def_templates[page_template_nr]["rows_to_fix"].keys():
-                df_current = fix_rows(df_current,
-                                      table_def_templates[page_template_nr]["rows_to_fix"][
-                                          n_rows], index_cols[0], n_rows)
+                df_current = fix_rows(
+                    df_current,
+                    table_def_templates[page_template_nr]["rows_to_fix"][n_rows],
+                    index_cols[0],
+                    n_rows,
+                )

         # replace category names with typos
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].replace(cat_names_fix)
+        df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)

         # replace empty strings
         df_current = df_current.replace(values_replacement)
@@ -106,7 +121,7 @@ if __name__ == "__main__":
         for col in cols_for_space_stripping:
             df_current[col] = df_current[col].str.strip()

-        # print(df_current.columns.values)
+        # print(df_current.columns.to_numpy())

         # aggregate dfs
         if df_all is None:
@@ -118,10 +133,11 @@ if __name__ == "__main__":
             cols_both = list(set(cols_all).intersection(set(cols_current)))
             # print(cols_both)
             if len(cols_both) > 0:
-                df_all = df_all.merge(df_current, how='outer', on=cols_both,
-                                      suffixes=(None, None))
+                df_all = df_all.merge(
+                    df_current, how="outer", on=cols_both, suffixes=(None, None)
+                )
             else:
-                df_all = df_all.merge(df_current, how='outer', suffixes=(None, None))
+                df_all = df_all.merge(df_current, how="outer", suffixes=(None, None))
             df_all = df_all.groupby(index_cols).first().reset_index()
             # df_all = df_all.join(df_current, how='outer')

@@ -137,28 +153,38 @@ if __name__ == "__main__":
     # replace cat names by codes in col "Categories"
     # first the manual replacements
     df_all["Categories"] = df_all["Categories"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-        return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-    df_all["Categories"] = df_all["Categories"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+    df_all["Categories"] = df_all["Categories"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )

     # make sure all col headers are str
     df_all.columns = df_all.columns.map(str)

     # remove thousands separators as pd.to_numeric can't deal with that
     # also replace None with NaN
-    year_cols = list(set(df_all.columns) - set(['Categories', 'entity', 'unit', 'orig_cat_name']))
+    year_cols = list(
+        set(df_all.columns) - set(["Categories", "entity", "unit", "orig_cat_name"])
+    )
     for col in year_cols:
         df_all.loc[:, col] = df_all.loc[:, col].str.strip()
-        def repl(m):
-            return m.group('part1') + m.group('part2')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-        df_all[col][df_all[col].isnull()] = 'NaN'
+
+        def repl(m):  # noqa: D103
+            return m.group("part1") + m.group("part2")
+
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace(
+            "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+        )
+        df_all[col][df_all[col].isna()] = "NaN"
         # manually map code NENO to nan
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('NENO','NaN')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('O NANaN','NaN')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('IE NO','0')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('IE NA NO I','0')
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("NENO", "NaN")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("O NANaN", "NaN")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("IE NO", "0")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("IE NA NO I", "0")
         # TODO: add code to PRIMAP2

     # drop orig_cat_name as it's non-unique per category
@@ -167,17 +193,17 @@ if __name__ == "__main__":
     data_if = pm2.pm2io.convert_wide_dataframe_if(
         df_all,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
-        #coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )

     data_pm2 = pm2.pm2io.from_interchange_format(data_if)

@@ -190,12 +216,15 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_if)
+        data_if,
+    )

     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )

     # ###
     # ## process the data
@@ -211,9 +240,9 @@ if __name__ == "__main__":
     )

     # adapt source and metadata
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)

     # ###
     # save data to IF and native format
@@ -222,9 +251,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )

     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

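The per-column cleanup above first strips thousands separators and then maps composite UNFCCC notation keys (NE, NO, IE, ...) to values pd.to_numeric can handle. A condensed stand-alone sketch of the same two steps (sample values invented for illustration):

import pandas as pd

values = pd.Series(["1,234.5", "NENO", "IE NO", "123.4"])

# remove the thousands separator so pd.to_numeric can parse the number
def repl(m):
    return m.group("part1") + m.group("part2")

values = values.str.replace(
    "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
)

# map merged notation keys, as in the script above
values = values.str.replace("NENO", "NaN").str.replace("IE NO", "0")

print(pd.to_numeric(values))  # 1234.5, NaN, 0.0, 123.4
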
+ 84 - 55
src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR4_from_pdf.py

@@ -1,10 +1,16 @@
-# this script reads data from Malaysia's BUR4
-# code ist mostly identical to BUR3
+"""
+Read Malaysia's BUR4 from pdf
+
+This script reads data from Malaysia's BUR4
+Data are read from pdf using camelot
+
+Code is mostly identical to BUR3
+"""


 import camelot
 import primap2 as pm2
-from .config_mys_bur4 import (
+from config_mys_bur4 import (
     cat_code_regexp,
     cat_codes_manual,
     cat_names_fix,
@@ -35,8 +41,8 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Malaysia' / 'BUR4'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Malaysia'
+    input_folder = downloaded_data_path / "UNFCCC" / "Malaysia" / "BUR4"
+    output_folder = extracted_data_path / "UNFCCC" / "Malaysia"
     if not output_folder.exists():
         output_folder.mkdir()

@@ -50,7 +56,7 @@ if __name__ == "__main__":
     # SF6: 255 - 260
     # NF3: 261 - 266

-    output_filename = 'MYS_BUR4_2022_'
+    output_filename = "MYS_BUR4_2022_"
     compression = dict(zlib=True, complevel=9)

     # ###
@@ -65,36 +71,44 @@ if __name__ == "__main__":
         area = table_def_templates[page_template_nr]["area"]
         if "cols" in table_def_templates[page_template_nr].keys():
             cols = table_def_templates[page_template_nr]["cols"]
-            tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page), \
-                                      flavor='stream', table_areas=area, columns=cols,
-                                      split_text=True)
+            tables = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+                columns=cols,
+                split_text=True,
+            )
         else:
-            tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page), \
-                                      flavor='stream', table_areas=area)
+            tables = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+            )

         df_current = tables[0].df.copy()
-        df_current.iloc[0,0] = 'Categories'
+        df_current.iloc[0, 0] = "Categories"
         df_current.columns = df_current.iloc[0]
         df_current = df_current.drop(0)
         # replace double \n
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("\n", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
         # replace double and triple spaces
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("   ", " ")
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("  ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")

         # fix the split rows
         if "rows_to_fix" in table_def_templates[page_template_nr].keys():
             for n_rows in table_def_templates[page_template_nr]["rows_to_fix"].keys():
-                df_current = fix_rows(df_current,
-                                      table_def_templates[page_template_nr]["rows_to_fix"][
-                                          n_rows], index_cols[0], n_rows)
+                df_current = fix_rows(
+                    df_current,
+                    table_def_templates[page_template_nr]["rows_to_fix"][n_rows],
+                    index_cols[0],
+                    n_rows,
+                )

         # replace category names with typos
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].replace(cat_names_fix)
+        df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)

         # replace empty strings
         df_current = df_current.replace(values_replacement)
@@ -109,22 +123,23 @@ if __name__ == "__main__":
         for col in cols_for_space_stripping:
             df_current[col] = df_current[col].str.strip()

-        # print(df_current.columns.values)
+        # print(df_current.columns.to_numpy())

         # aggregate dfs
         if df_all is None:
             df_all = df_current
         else:
             # find intersecting cols
-            cols_all = df_all.columns.values
-            cols_current = df_current.columns.values
+            cols_all = df_all.columns.to_numpy()
+            cols_current = df_current.columns.to_numpy()
             cols_both = list(set(cols_all).intersection(set(cols_current)))
             # print(cols_both)
             if len(cols_both) > 0:
-                df_all = df_all.merge(df_current, how='outer', on=cols_both,
-                                      suffixes=(None, None))
+                df_all = df_all.merge(
+                    df_current, how="outer", on=cols_both, suffixes=(None, None)
+                )
             else:
-                df_all = df_all.merge(df_current, how='outer', suffixes=(None, None))
+                df_all = df_all.merge(df_current, how="outer", suffixes=(None, None))
             df_all = df_all.groupby(index_cols).first().reset_index()
             # df_all = df_all.join(df_current, how='outer')

@@ -140,28 +155,38 @@ if __name__ == "__main__":
     # replace cat names by codes in col "Categories"
     # first the manual replacements
     df_all["Categories"] = df_all["Categories"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-        return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-    df_all["Categories"] = df_all["Categories"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+    df_all["Categories"] = df_all["Categories"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )

     # make sure all col headers are str
     df_all.columns = df_all.columns.map(str)

     # remove thousands separators as pd.to_numeric can't deal with that
     # also replace None with NaN
-    year_cols = list(set(df_all.columns) - set(['Categories', 'entity', 'unit', 'orig_cat_name']))
+    year_cols = list(
+        set(df_all.columns) - set(["Categories", "entity", "unit", "orig_cat_name"])
+    )
     for col in year_cols:
         df_all.loc[:, col] = df_all.loc[:, col].str.strip()
-        def repl(m):
-            return m.group('part1') + m.group('part2')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-        df_all[col][df_all[col].isnull()] = 'NaN'
+
+        def repl(m):  # noqa: D103
+            return m.group("part1") + m.group("part2")
+
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace(
+            "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+        )
+        df_all[col][df_all[col].isna()] = "NaN"
         # manually map code NENO to nan
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('NENO','NaN')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('O NANaN','NaN')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('IE NO','0')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('IE NA NO I','0')
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("NENO", "NaN")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("O NANaN", "NaN")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("IE NO", "0")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("IE NA NO I", "0")
         # TODO: add code to PRIMAP2

     # drop orig_cat_name as it's non-unique per category
@@ -170,17 +195,17 @@ if __name__ == "__main__":
     data_if = pm2.pm2io.convert_wide_dataframe_if(
         df_all,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
-        #coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )

     data_pm2 = pm2.pm2io.from_interchange_format(data_if)

@@ -193,12 +218,15 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_if)
+        data_if,
+    )

     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )

     # ###
     # ## process the data
@@ -214,9 +242,9 @@ if __name__ == "__main__":
     )

     # adapt source and metadata
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)

     # ###
     # save data to IF and native format
@@ -225,9 +253,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )

     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

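Both Malaysia readers assemble the full inventory by outer-merging the per-page tables on whatever columns they share, so pages contributing different gases extend the table sideways instead of appending rows. A toy sketch of that merge step (sample frames invented for illustration):

import pandas as pd

# tables as read from two different pdf pages
page1 = pd.DataFrame({"Categories": ["1", "2"], "CO2": ["1.0", "2.0"]})
page2 = pd.DataFrame({"Categories": ["1", "2"], "CH4": ["0.1", "0.2"]})

# merge on the shared columns ("Categories" here), as in the scripts above
cols_both = list(set(page1.columns).intersection(set(page2.columns)))
df_all = page1.merge(page2, how="outer", on=cols_both)

print(df_all)  # one wide table with Categories, CO2 and CH4 columns
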
+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Mexico/__init__.py

@@ -0,0 +1,30 @@
+"""Read Mexico's BURs, NIRs, NCs
+
+Scripts and configurations to read Mexico's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'MEX'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    # print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=MEX
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 81 - 35
src/unfccc_ghg_data/unfccc_reader/Mexico/config_mex_bur3.py

@@ -1,8 +1,42 @@
+"""Config for Mexico's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 import pandas as pd


-def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str,
-             n_rows: int) -> pd.DataFrame:
+def fix_rows(
+    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
+) -> pd.DataFrame:
+    """
+    Combine split rows
+
+    This function combines rows which have been split into several rows during data
+    reading from pdf because they contained line breaks.
+
+    Parameters
+    ----------
+    data: pd.DataFrame
+        The data to work with
+    rows_to_fix: list
+        List of values for which to fix rows
+    col_to_use: str
+        column to use to find the rows to merge
+    n_rows: int
+        How many rows to combine for each row found. e.g. 3 means combine the found
+        row with the following two rows. Negative values are used for more
+        complicated situations where the rows to merge are also before the position
+        of the value that indicates the merge. See code for details
+
+    Returns
+    -------
+        pandas DataFrame with combined rows. The individual rows are removed
+
+    TODO: move function to helper module (make sure to have one function that works
+     for all cases)
+    """
     for row in rows_to_fix:
         # print(row)
         # find the row number and collect the row and the next two rows
@@ -16,29 +50,29 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str,
         for item in index:
             loc = data.index.get_loc(item)
             ####print(data[col_to_use].loc[loc + 1])
-            if n_rows == -2:
+            if n_rows == -2:  # noqa: PLR2004
                 locs_to_merge = list(range(loc - 1, loc + 1))
                 loc_to_check = loc - 1
-            if n_rows == -6:
+            elif n_rows == -6:  # noqa: PLR2004
                 locs_to_merge = list(range(loc - 3, loc + 3))
                 loc_to_check = loc - 3
-            elif n_rows == -3:
+            elif n_rows == -3:  # noqa: PLR2004
                 locs_to_merge = list(range(loc - 1, loc + 2))
                 loc_to_check = loc - 1
             else:
                 locs_to_merge = list(range(loc, loc + n_rows))
                 loc_to_check = loc + 1

-            if (data[col_to_use].loc[loc_to_check] == '') or n_rows == 2:
+            if (not data[col_to_use].loc[loc_to_check]) or n_rows == 2:  # noqa: PLR2004
                 rows_to_merge = data.iloc[locs_to_merge]
                 indices_to_merge = rows_to_merge.index
                 # replace numerical NaN values
                 ####print(rows_to_merge)
-                rows_to_merge = rows_to_merge.fillna('')
+                rows_to_merge = rows_to_merge.fillna("")
                 ####print("fillna")
                 ####print("fillna")
                 ####print(rows_to_merge)
                 ####print(rows_to_merge)
                 # join the three rows
                 # join the three rows
-                new_row = rows_to_merge.agg(' '.join)
+                new_row = rows_to_merge.agg(" ".join)
                 # replace the double spaces that are created
                 # must be done here and not at the end as splits are not always
                 # the same and join would produce different col values
@@ -54,67 +88,77 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str,
         data = data.reset_index(drop=True)
     return data

+
 page_defs = {
-    '118': {
+    "118": {
         "camelot": {
         "camelot": {
-            "table_areas": ['49,602,551,73'],
-            "columns": ['223,277,314,348,392,422,446,483'],
+            "table_areas": ["49,602,551,73"],
+            "columns": ["223,277,314,348,392,422,446,483"],
             "split_text": False,
             "split_text": False,
             "flavor": "stream",
             "flavor": "stream",
         },
         },
         "rows_to_fix": {
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
             -6: ["Categorías de fuentes y"],
-            3: ["Todas las emisiones y las absorciones",
+            3: [
+                "Todas las emisiones y las absorciones",
                 "Todas las emisiones (sin [3B] Tierra ni",
                 "Todas las emisiones (sin [3B] Tierra ni",
                 "[1A] Actividades de quema del",
                 "[1A] Actividades de quema del",
                 "[1A2] Industrias manufactura y de la",
                 "[1A2] Industrias manufactura y de la",
                 "[1B] Emisiones fugitivas provenientes de",
                 "[1B] Emisiones fugitivas provenientes de",
-                "[2] Procesos industriales y uso de"],
+                "[2] Procesos industriales y uso de",
+            ],
         },
     },
-    '119': {
+    "119": {
         "camelot": {
         "camelot": {
-            "table_areas": ['49,650,551,77'],
-            "columns": ['228,275,317,352,394,421,446,483'],
+            "table_areas": ["49,650,551,77"],
+            "columns": ["228,275,317,352,394,421,446,483"],
             "split_text": True,
             "split_text": True,
             "flavor": "stream",
             "flavor": "stream",
         },
         },
         "rows_to_fix": {
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
             -6: ["Categorías de fuentes y"],
-            3: ["[2B4] Producción de caprolactama,",
+            3: [
+                "[2B4] Producción de caprolactama,",
                 "[2B8] Producción petroquímica y negro",
                 "[2B8] Producción petroquímica y negro",
                 "[2D] Uso de productos no energéticos de",
                 "[2D] Uso de productos no energéticos de",
-                "[2E1] Circuitos integrados o"],
+                "[2E1] Circuitos integrados o",
+            ],
         },
     },
-    '120': {
+    "120": {
         "camelot": {
         "camelot": {
-            "table_areas": ['49,650,551,77'],
-            "columns": ['223,277,314,348,392,422,446,483'],
+            "table_areas": ["49,650,551,77"],
+            "columns": ["223,277,314,348,392,422,446,483"],
             "split_text": False,
             "split_text": False,
             "flavor": "stream",
             "flavor": "stream",
         },
         },
         "rows_to_fix": {
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
             -6: ["Categorías de fuentes y"],
             -3: ["[3B] Tierra"],
             -3: ["[3B] Tierra"],
-            3: ["[2F] Uso de productos sustitutos de las",
+            3: [
+                "[2F] Uso de productos sustitutos de las",
                 "[2G] Manufactura y utilización de otros",
                 "[2G] Manufactura y utilización de otros",
-                "[3] Agricultura, silvicultura y otros usos"],
-            2: ["[2H2] Industria de la alimentación y las",
-                "[2G2] SF₆ y PFC de otros usos de"],
+                "[3] Agricultura, silvicultura y otros usos",
+            ],
+            2: [
+                "[2H2] Industria de la alimentación y las",
+                "[2G2] SF₆ y PFC de otros usos de",
+            ],
         },
     },
-    '121': {
+    "121": {
         "camelot": {
         "camelot": {
-            "table_areas": ['49,650,551,70'],
-            "columns": ['223,277,314,348,392,422,446,483'],
+            "table_areas": ["49,650,551,70"],
+            "columns": ["223,277,314,348,392,422,446,483"],
             "split_text": False,
             "split_text": False,
             "flavor": "stream",
             "flavor": "stream",
         },
         },
         "rows_to_fix": {
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
             -6: ["Categorías de fuentes y"],
             -3: ["[3B1] Tierra forestales"],
             -3: ["[3B1] Tierra forestales"],
-            3: ["[3C] Fuentes agregadas y fuentes de",
+            3: [
+                "[3C] Fuentes agregadas y fuentes de",
                 "[3C1] Emisiones de GEI por quemado de",
                 "[3C1] Emisiones de GEI por quemado de",
                 "[3C4] Emisiones directas de los N₂O de",
                 "[3C4] Emisiones directas de los N₂O de",
                 "[3C5] Emisiones indirectas de los N₂O de",
                 "[3C5] Emisiones indirectas de los N₂O de",
@@ -123,24 +167,26 @@ page_defs = {
                 "[4A2] Sitios no controlados de",
                 "[4A2] Sitios no controlados de",
                 "[4A3] Tiraderos a cielo abierto para",
                 "[4A3] Tiraderos a cielo abierto para",
                 "[4B] Tratamiento biológico de los",
                 "[4B] Tratamiento biológico de los",
-                ],
+            ],
         },
     },
-    '122': {
+    "122": {
         "camelot": {
         "camelot": {
-            "table_areas": ['49,650,551,404'],
-            "columns": ['223,277,314,348,392,422,446,483'],
+            "table_areas": ["49,650,551,404"],
+            "columns": ["223,277,314,348,392,422,446,483"],
             "split_text": False,
             "split_text": False,
             "flavor": "stream",
             "flavor": "stream",
         },
         },
         "rows_to_fix": {
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
             -6: ["Categorías de fuentes y"],
-            3: ["[4C] Incineración y quema a cielo abierto",
+            3: [
+                "[4C] Incineración y quema a cielo abierto",
                 "[4C1] Incineración de residuos peligrosos",
                 "[4C1] Incineración de residuos peligrosos",
                 "[4C2] Quema a cielo abierto de residuos",
                 "[4C2] Quema a cielo abierto de residuos",
                 "[4D] Tratamiento y eliminación de aguas",
                 "[4D] Tratamiento y eliminación de aguas",
                 "[4D1] Tratamiento y eliminación de",
                 "[4D1] Tratamiento y eliminación de",
-                "[4D2] Tratamiento y eliminación de"],
+                "[4D2] Tratamiento y eliminación de",
+            ],
         },
     },
 }

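fix_rows above re-joins category labels that camelot split over several table rows. For the simple n_rows = 2 case the merge boils down to joining consecutive rows column-wise; a toy pandas sketch of just that step (not the full helper, which also handles the negative n_rows cases):

import pandas as pd

df = pd.DataFrame(
    {
        "cat": ["[1A] Actividades de quema del", "combustible", "[1B] Emisiones"],
        "2019": ["", "123", "456"],
    }
)

# join row 0 with the row below it and clean up the spacing
merged = df.iloc[0:2].fillna("").agg(" ".join).str.replace("  ", " ").str.strip()
df = pd.concat([merged.to_frame().T, df.iloc[2:]], ignore_index=True)

print(df)  # row 0: "[1A] Actividades de quema del combustible", "123"
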
+ 63 - 66
src/unfccc_ghg_data/unfccc_reader/Mexico/read_MEX_BUR3_from_pdf.py

@@ -1,10 +1,15 @@
-# this script reads data from Mexico's BUR3
-# Data is read from the pdf file
+"""
+Read Mexico's BUR3 from pdf
+
+This script reads data from Mexico's BUR3
+Data are read from pdf using camelot
+
+"""
 
 
 import camelot
 import camelot
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from .config_mex_bur3 import fix_rows, page_defs
+from config_mex_bur3 import fix_rows, page_defs

 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path

@@ -12,16 +17,16 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Mexico' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Mexico'
+    input_folder = downloaded_data_path / "UNFCCC" / "Mexico" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Mexico"
     if not output_folder.exists():
-       output_folder.mkdir()
+        output_folder.mkdir()

-    output_filename = 'MEX_BUR3_2022_'
+    output_filename = "MEX_BUR3_2022_"
     compression = dict(zlib=True, complevel=9)
-    inventory_file = 'Mexico_3er_BUR.pdf'
+    inventory_file = "Mexico_3er_BUR.pdf"

-    gwp_to_use = 'AR5GWP100'
+    gwp_to_use = "AR5GWP100"
     year = 2019
     entity_row = 0
     unit_row = 1
@@ -43,12 +48,12 @@ if __name__ == "__main__":

     # manual category codes
     cat_codes_manual = {
-        'Todas las emisiones y las absorciones nacionales': '0',
-        'Todas las emisiones (sin [3B] Tierra ni [3D1] Productos de madera recolectada': 'M0EL',
-        '2F6 Otras aplicaciones': '2F6',
+        "Todas las emisiones y las absorciones nacionales": "0",
+        "Todas las emisiones (sin [3B] Tierra ni [3D1] Productos de madera recolectada": "M0EL",
+        "2F6 Otras aplicaciones": "2F6",
     }

-    cat_code_regexp = r'^\[(?P<code>[a-zA-Z0-9]{1,3})\].*'
+    cat_code_regexp = r"^\[(?P<code>[a-zA-Z0-9]{1,3})\].*"

     coords_cols = {
         "category": "category",
@@ -77,18 +82,17 @@ if __name__ == "__main__":
         "unit": "PRIMAP1",
         "unit": "PRIMAP1",
         "category": "PRIMAP1",
         "category": "PRIMAP1",
         "entity": {
         "entity": {
-            'CH₄': 'CH4',
-            'CO₂': 'CO2',
-            'EMISIONES NETAS PCG AR5': 'KYOTOGHG (AR5GWP100)',
-            'HFC': f"HFCS ({gwp_to_use})",
-            'NF₃': f"NF3 ({gwp_to_use})",
-            'N₂O': 'N2O',
-            'PFC': f"PFCS ({gwp_to_use})",
-            'SF₆': f"SF6 ({gwp_to_use})",
+            "CH₄": "CH4",
+            "CO₂": "CO2",
+            "EMISIONES NETAS PCG AR5": "KYOTOGHG (AR5GWP100)",
+            "HFC": f"HFCS ({gwp_to_use})",
+            "NF₃": f"NF3 ({gwp_to_use})",
+            "N₂O": "N2O",
+            "PFC": f"PFCS ({gwp_to_use})",
+            "SF₆": f"SF6 ({gwp_to_use})",
         },
     }

-
     filter_remove = {}

     filter_keep = {}
@@ -102,11 +106,6 @@ if __name__ == "__main__":
         "institution": "UNFCCC",
         "institution": "UNFCCC",
     }
     }
 
 
-    # convert to mass units where possible
-    entities_to_convert_to_mass = [
-        'NF3', 'SF6'
-    ]
-
     # ###
     # read the data from pdf into one long format dataframe
     # ###
@@ -114,8 +113,9 @@ if __name__ == "__main__":
     for page in page_defs.keys():
         print(f"Working on page {page}")
         page_def = page_defs[page]
-        tables = camelot.read_pdf(str(input_folder / inventory_file), pages=page,
-                                  **page_def["camelot"])
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file), pages=page, **page_def["camelot"]
+        )
         df_this_table = tables[0].df

         # fix rows
@@ -127,31 +127,36 @@ if __name__ == "__main__":
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("-", "-")
             # replace double space in entity
             df_this_table.iloc[0, :] = df_this_table.iloc[0, :].str.replace("  ", " ")
-            df_this_table = fix_rows(df_this_table, page_def["rows_to_fix"][n_rows], 0,
-                                     n_rows)
+            df_this_table = fix_rows(
+                df_this_table, page_def["rows_to_fix"][n_rows], 0, n_rows
+            )

         # add units
-        for col in df_this_table.columns.values:
+        for col in df_this_table.columns.to_numpy():
             if df_this_table[col].iloc[0] in units.keys():
                 df_this_table[col].iloc[1] = units[df_this_table[col].iloc[0]]

         # bring in right format for conversion to long format
-        df_this_table = pm2.pm2io.nir_add_unit_information(df_this_table, unit_row=unit_row,
-                                                           entity_row=entity_row,
-                                                           regexp_unit=".*",
-                                                           regexp_entity=".*",
-                                                           default_unit="GgCO2eq")
+        df_this_table = pm2.pm2io.nir_add_unit_information(
+            df_this_table,
+            unit_row=unit_row,
+            entity_row=entity_row,
+            regexp_unit=".*",
+            regexp_entity=".*",
+            default_unit="GgCO2eq",
+        )
 
 
         # set index and convert to long format
         # set index and convert to long format
         df_this_table = df_this_table.set_index(index_cols)
         df_this_table = df_this_table.set_index(index_cols)
-        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(df_this_table, year,
-                                                              header_long)
+        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(
+            df_this_table, year, header_long
+        )

         # combine with tables for other sectors (merge not append)
         if df_all is None:
             df_all = df_this_table_long
         else:
-            df_all = pd.concat([df_all, df_this_table_long], axis=0, join='outer')
+            df_all = pd.concat([df_all, df_this_table_long], axis=0, join="outer")

     # ###
     # conversion to PM2 IF
@@ -162,15 +167,19 @@ if __name__ == "__main__":
     # replace cat names by codes in col "category"
     # replace cat names by codes in col "category"
     # first the manual replacements
     # first the manual replacements
     df_all["category"] = df_all["category"].replace(cat_codes_manual)
     df_all["category"] = df_all["category"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-       return m.group('code')
-    df_all["category"] = df_all["category"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_all["category"] = df_all["category"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
     df_all = df_all.reset_index(drop=True)

     # replace "," and " " with "" in data
-    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(',','', regex=False)
-    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(' ','', regex=False)
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(",", "", regex=False)
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(" ", "", regex=False)

     # make sure all col headers are str
     df_all.columns = df_all.columns.map(str)
@@ -185,12 +194,13 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
-        convert_str=True
-        )
+        convert_str=True,
+        time_format="%Y",
+    )

     cat_label = "category (IPCC2006)"
     # fix error cats
@@ -198,21 +208,6 @@ if __name__ == "__main__":

     data_pm2 = pm2.pm2io.from_interchange_format(data_if)

-    # convert to mass units from CO2eq
-
-    entities_to_convert = [f"{entity} ({gwp_to_use})" for entity in
-                           entities_to_convert_to_mass]
-
-    for entity in entities_to_convert:
-        converted = data_pm2[entity].pr.convert_to_mass()
-        basic_entity = entity.split(" ")[0]
-        converted = converted.to_dataset(name=basic_entity)
-        data_pm2 = data_pm2.pr.merge(converted)
-        data_pm2[basic_entity].attrs["entity"] = basic_entity
-
-    # drop the GWP data
-    data_pm2 = data_pm2.drop_vars(entities_to_convert)
-
     # convert back to IF to have units in the fixed format
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()

     if not output_folder.exists():
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
 
     encoding = {var: compression for var in data_pm2.data_vars}
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
     data_pm2.pr.to_netcdf(
         output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
         output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
-        encoding=encoding)
+        encoding=encoding,
+    )

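The regex replacement above reduces each Spanish category label to the IPCC 2006 code in square brackets (cat_code_regexp). A quick stand-alone check of the pattern, with sample labels taken from the tables above:

import pandas as pd

cat_code_regexp = r"^\[(?P<code>[a-zA-Z0-9]{1,3})\].*"

cats = pd.Series(
    [
        "[1A2] Industrias manufactura y de la construcción",
        "[3C4] Emisiones directas de los N₂O de suelos gestionados",
    ]
)

# keep only the bracketed code, dropping the descriptive text
cats = cats.str.replace(cat_code_regexp, lambda m: m.group("code"), regex=True)

print(cats.tolist())  # ['1A2', '3C4']
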
+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Montenegro/__init__.py

@@ -0,0 +1,30 @@
+"""Read Montenegro's BURs, NIRs, NCs
+
+Scripts and configurations to read Montenegro's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'MNE'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    # print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=MNE
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 103 - 47
src/unfccc_ghg_data/unfccc_reader/Montenegro/config_mne_bur3.py

@@ -1,67 +1,123 @@
+"""Config for Montenegro's BUR3
+
+Partial configuration for camelot and data aggregation. PRIMAP2 conversion
+config and metadata are defined in the reading script.
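+
+The reading script drops the inconsistent duplicate series listed in
+``drop_data`` before merging the camelot tables. A minimal, self-contained
+sketch of that filtering step (abbreviated config and made-up numbers; the
+real logic lives in ``read_MNE_BUR3_from_pdf.py``):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    # abbreviated drop_data entry for one camelot table index
+    drop_data = {2: {"cats": ["1", "1.A"], "years": ["2005"]}}
+
+    df = pd.DataFrame(
+        {
+            "category": ["0.EL", "1", "1.A"],
+            "2005": [1.0, 2.0, 3.0],
+            "2006": [4.0, 5.0, 6.0],
+        }
+    )
+    to_drop = drop_data[2]
+    if "cats" in to_drop:  # drop whole categories
+        mask = df["category"].isin(to_drop["cats"])
+        df = df.drop(df[mask].index, axis=0)
+    if "years" in to_drop:  # drop duplicated year columns
+        df = df.drop(columns=to_drop["years"])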
+
+"""
+
 # most time series are contained twice and 2005 data is also contained twice. Some
 # data is inconsistent and we remove the time series with errors
 drop_data = {
-    2: { # individual sector time series are (mostly) wrong, leave only 0.EL timeseries
-        "cats": ["1", "1.A", "1.A.1", "1.A.1", "1.A.2", "1.A.3", "1.A.4", "1.A.5", "1.B", "1.B.1", "1.B.2",
-                 "2", "2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H",
-                 "3", "3.A", "3.B"],
-        #"years": ["2005"], # 2005 data copy of 2019
+    2: {  # individual sector time series are (mostly) wrong, leave only 0.EL timeseries
+        "cats": [
+            "1",
+            "1.A",
+            "1.A.1",
+            "1.A.1",
+            "1.A.2",
+            "1.A.3",
+            "1.A.4",
+            "1.A.5",
+            "1.B",
+            "1.B.1",
+            "1.B.2",
+            "2",
+            "2.A",
+            "2.B",
+            "2.C",
+            "2.D",
+            "2.E",
+            "2.F",
+            "2.G",
+            "2.H",
+            "3",
+            "3.A",
+            "3.B",
+        ],
+        # "years": ["2005"], # 2005 data copy of 2019
     },
-    3: { # individual sector time series are (mostly) wrong, leave only 0.EL timeseries
-        "cats": ["3.C", "3.D", "3.E", "3.F", "3.G", "5", "5.A", "5.B", "5.C", "5.D", "6"]
-        #"years": ["2005"],
+    3: {  # individual sector time series are (mostly) wrong, leave only 0.EL timeseries
+        "cats": [
+            "3.C",
+            "3.D",
+            "3.E",
+            "3.F",
+            "3.G",
+            "5",
+            "5.A",
+            "5.B",
+            "5.C",
+            "5.D",
+            "6",
+        ]
+        # "years": ["2005"],
     },
-    6: { #2005 data copy of 2019
+    6: {  # 2005 data copy of 2019
         "years": ["2005"],
     },
-    7: { # 2005 data copy of 2019 for 3.G
+    7: {  # 2005 data copy of 2019 for 3.G
         "years": ["2005"],
     },
-    25: { # 2005 data copy of 2019 (CO2, 2005-2019, first table)
+    25: {  # 2005 data copy of 2019 (CO2, 2005-2019, first table)
         "years": ["2005"],
     },
-    26: { # 2005 data copy of 2019 (CO2, 2005-2019, second table)
+    26: {  # 2005 data copy of 2019 (CO2, 2005-2019, second table)
         "years": ["2005"],
     },
 }

 cat_mapping = {
-    '3': 'M.AG',
-    '3.A': '3.A.1',
-    '3.B': '3.A.2',
-    '3.C': '3.C.7', # rice
-    '3.D': 'M.3.C.45AG', # Agricultural soils
-    '3.E': '3.C.1.c', # prescribed burning of savanna
-    '3.F': '3.C.1.b', # field burning of agricultural residues
-    '3.G': '3.C.3', # urea application
-    '4': 'M.LULUCF',
-    '4.A': '3.B.1', # forest
-    '4.B': '3.B.2', # cropland
-    '4.C': '3.B.3', # grassland
-    '4.D': '3.B.4', # wetland
-    '4.E': '3.B.5', # Settlements
-    '4.F': '3.B.6', # other land
-    '4.G': '3.D.1', # HWP
-    '5': '4',
-    '5.A': '4.A',
-    '5.B': '4.B',
-    '5.C': '4.C',
-    '5.D': '4.D',
-    '6': '5',
+    "3": "M.AG",
+    "3.A": "3.A.1",
+    "3.B": "3.A.2",
+    "3.C": "3.C.7",  # rice
+    "3.D": "M.3.C.45AG",  # Agricultural soils
+    "3.E": "3.C.1.c",  # prescribed burning of savanna
+    "3.F": "3.C.1.b",  # field burning of agricultural residues
+    "3.G": "3.C.3",  # urea application
+    "4": "M.LULUCF",
+    "4.A": "3.B.1",  # forest
+    "4.B": "3.B.2",  # cropland
+    "4.C": "3.B.3",  # grassland
+    "4.D": "3.B.4",  # wetland
+    "4.E": "3.B.5",  # Settlements
+    "4.F": "3.B.6",  # other land
+    "4.G": "3.D.1",  # HWP
+    "5": "4",
+    "5.A": "4.A",
+    "5.B": "4.B",
+    "5.C": "4.C",
+    "5.D": "4.D",
+    "6": "5",
 }

 aggregate_cats = {
-    '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-    '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.3', '3.B.4', '3.B.5', '3.B.6'], 'name': 'Land'},
-    'M.3.C.1.AG': {'sources': ['3.C.1.c', '3.C.1.b'], 'name': 'Emissions from Biomass '
-                                                          'Burning (Agriculture)'},
-    '3.C.1': {'sources': ['3.C.1.c', '3.C.1.b'], 'name': 'Emissions from Biomass Burning'},
-    '3.C': {'sources': ['3.C.1', '3.C.3', 'M.3.C.45AG', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-    'M.3.C.AG': {'sources': ['3.C.1.AG', '3.C.3', 'M.3.C.45AG', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-    '3.D': {'sources': ['3.D.1'], 'name': 'Other'},
-    '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
-    'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock emissions'},
-    '0': {'sources': ['1', '2', '3', '4', '5']},
+    "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+    "3.B": {
+        "sources": ["3.B.1", "3.B.2", "3.B.3", "3.B.4", "3.B.5", "3.B.6"],
+        "name": "Land",
+    },
+    "M.3.C.1.AG": {
+        "sources": ["3.C.1.c", "3.C.1.b"],
+        "name": "Emissions from Biomass " "Burning (Agriculture)",
+    },
+    "3.C.1": {
+        "sources": ["3.C.1.c", "3.C.1.b"],
+        "name": "Emissions from Biomass Burning",
+    },
+    "3.C": {
+        "sources": ["3.C.1", "3.C.3", "M.3.C.45AG", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land",
+    },
+    "M.3.C.AG": {
+        "sources": ["3.C.1.AG", "3.C.3", "M.3.C.45AG", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+    },
+    "3.D": {"sources": ["3.D.1"], "name": "Other"},
+    "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
+    "M.AG.ELV": {
+        "sources": ["M.3.C.AG"],
+        "name": "Agriculture excluding livestock emissions",
+    },
+    "0": {"sources": ["1", "2", "3", "4", "5"]},
 }

+ 88 - 56
src/unfccc_ghg_data/unfccc_reader/Montenegro/read_MNE_BUR3_from_pdf.py

@@ -1,41 +1,41 @@
-# Montenegro BUR 3
-# Code to read the emissions inventory contained in Montenegro's third BUR from pdf
-# and convert into PRIMAP2 format
+"""
+Read Montenegro's BUR3 from pdf
+
+This script reads data from Montenegro's BUR3
+Data are read from pdf using camelot
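+
+One possible way to run it directly (a sketch, assuming the repository's
+poetry environment):
+
+.. code-block:: bash
+
+    poetry run python src/unfccc_ghg_data/unfccc_reader/Montenegro/read_MNE_BUR3_from_pdf.py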
+
+"""
+
 
 
 # ###
 # imports
 # ###
 import copy
 import re
-from pathlib import Path
 
 
 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_mne_bur3 import aggregate_cats, cat_mapping, drop_data
+from config_mne_bur3 import aggregate_cats, cat_mapping, drop_data
 from primap2.pm2io._data_reading import matches_time_format
 
 
+from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
+
 if __name__ == "__main__":
     # ###
     # configuration
     # ###
 
 
-    # folders and files
-    root_path = Path(__file__).parents[3].absolute()
-    root_path = root_path.resolve()
-    downloaded_data_path = root_path / "downloaded_data"
-    extracted_data_path = root_path / "extracted_data"
-
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Montenegro' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Montenegro'
-    output_filename = 'MNE_BUR3_2022_'
+    input_folder = downloaded_data_path / "UNFCCC" / "Montenegro" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Montenegro"
+    output_filename = "MNE_BUR3_2022_"
     compression = dict(zlib=True, complevel=9)
 
 
-    inventory_file_pdf = 'NIR-2021_MNE_Finalversion.pdf'
+    inventory_file_pdf = "NIR-2021_MNE_Finalversion.pdf"
 
 
     # reading and processing
     years_to_read = range(1990, 2018 + 1)
-    pages_to_read = range(535,583)
+    pages_to_read = range(535, 583)
 
 
     pos_entity = [0, 0]
     cat_code_col = 0
@@ -43,7 +43,7 @@ if __name__ == "__main__":
     regex_unit = r"\((.*)\)"
     regex_entity = r"^(.*)\s\("
 
 
-    gwp_to_use = 'AR4GWP100'
+    gwp_to_use = "AR4GWP100"
 
 
     # conversion to PRIMAP2 format
 
 
@@ -61,28 +61,28 @@ if __name__ == "__main__":
     }

     coords_value_mapping = {
-        'unit': 'PRIMAP1',
-        'entity': {
+        "unit": "PRIMAP1",
+        "entity": {
             f"GHG ({gwp_to_use})": f"KYOTOGHG ({gwp_to_use})",
             f"HFC ({gwp_to_use})": f"HFCS ({gwp_to_use})",
             f"PFC ({gwp_to_use})": f"PFCS ({gwp_to_use})",
         },
-        'category': {
-            'Total national GHG emissions (with LULUCF)': '0',
-            'Total national GHG emissions (without LULUCF)': 'M.0.EL',
-            'International Bunkers': 'M.BK',
-            '1.A.3.a.i': 'M.BK.A',
-            '1.A.3.d.i': 'M.BK.M',
-            'CO2 from Biomass Combustion for Energy Production': 'M.BIO',
-            '6 Other': '6',
-            '2 H': '2.H',
+        "category": {
+            "Total national GHG emissions (with LULUCF)": "0",
+            "Total national GHG emissions (without LULUCF)": "M.0.EL",
+            "International Bunkers": "M.BK",
+            "1.A.3.a.i": "M.BK.A",
+            "1.A.3.d.i": "M.BK.M",
+            "CO2 from Biomass Combustion for Energy Production": "M.BIO",
+            "6 Other": "6",
+            "2 H": "2.H",
         },
     }

     coords_value_filling = {
         "category": {
             "orig_cat_name": {
-                'International Bunkers': 'M.BK',
+                "International Bunkers": "M.BK",
             },
         },
     }
@@ -103,7 +103,8 @@ if __name__ == "__main__":
         "references": "https://unfccc.int/documents/461972",
         "rights": "",
         "contact": "mail@johannes-guetschow.de",
-        "title": "Montenegro. Biennial update report (BUR). BUR 3. National inventory report.",
+        "title": "Montenegro. Biennial update report (BUR). "
+        "BUR 3. National inventory report.",
         "comment": "Read fom pdf file by Johannes Gütschow",
         "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
     }
@@ -111,7 +112,11 @@ if __name__ == "__main__":
     # ###
     # Read all time series table from pdf
     # ###
-    tables = camelot.read_pdf(str(input_folder / inventory_file_pdf), pages=','.join([str(page) for page in pages_to_read]), flavor='lattice')
+    tables = camelot.read_pdf(
+        str(input_folder / inventory_file_pdf),
+        pages=",".join([str(page) for page in pages_to_read]),
+        flavor="lattice",
+    )
 
 
     # ###
     # process tables and combine them using the pm2 pr.merge function
@@ -142,11 +147,14 @@ if __name__ == "__main__":
 
 
         # remove ',' in numbers
         years = df_current_table.columns[2:]
-        def repl(m):
+
+        def repl(m):  # noqa: D103
             return m.group("part1") + m.group("part2")
+
         for year in years:
             df_current_table.loc[:, year] = df_current_table.loc[:, year].str.replace(
-                '(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
+                "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+            )
 
 
         # add entity and unit cols
         df_current_table["entity"] = entity
@@ -156,13 +164,15 @@ if __name__ == "__main__":
             to_drop = drop_data[i]
             if "cats" in to_drop.keys():
                 mask = df_current_table["category"].isin(to_drop["cats"])
-                df_current_table = df_current_table.drop(df_current_table[mask].index,
-                                                         axis=0)
+                df_current_table = df_current_table.drop(
+                    df_current_table[mask].index, axis=0
+                )
             if "years" in to_drop.keys():
                 df_current_table = df_current_table.drop(columns=to_drop["years"])
 
 
         df_current_table["category"] = df_current_table["category"].fillna(
-            value=df_current_table["orig_cat_name"])
+            value=df_current_table["orig_cat_name"]
+        )
 
 
         df_current_table = df_current_table.drop(columns="orig_cat_name")
 
 
@@ -191,7 +201,7 @@ if __name__ == "__main__":
     # ###

     # convert to mass units from CO2eq
-    entities_to_convert = ['N2O', 'SF6', 'CH4']
+    entities_to_convert = ["N2O", "SF6", "CH4"]
     entities_to_convert = [f"{entity} ({gwp_to_use})" for entity in entities_to_convert]
 
 
     # for entity in entities_to_convert:
@@ -215,21 +225,28 @@ if __name__ == "__main__":
 
 
     # map categories
     data_if_2006 = data_if_2006.replace(
-        {f"category ({coords_terminologies['category']})": cat_mapping})
+        {f"category ({coords_terminologies['category']})": cat_mapping}
+    )
     data_if_2006[f"category ({coords_terminologies['category']})"].unique()

     # rename the category col
-    data_if_2006.rename(columns={
-        f"category ({coords_terminologies['category']})": 'category (IPCC2006_PRIMAP)'},
-                        inplace=True)
-    data_if_2006.attrs['attrs']['cat'] = 'category (IPCC2006_PRIMAP)'
-    data_if_2006.attrs['dimensions']['*'] = [
-        'category (IPCC2006_PRIMAP)' if item == f"category ({coords_terminologies['category']})"
-        else item for item in data_if_2006.attrs['dimensions']['*']]
+    data_if_2006 = data_if_2006.rename(
+        columns={
+            f"category ({coords_terminologies['category']})": "category (IPCC2006_PRIMAP)"
+        }
+    )
+    data_if_2006.attrs["attrs"]["cat"] = "category (IPCC2006_PRIMAP)"
+    data_if_2006.attrs["dimensions"]["*"] = [
+        "category (IPCC2006_PRIMAP)"
+        if item == f"category ({coords_terminologies['category']})"
+        else item
+        for item in data_if_2006.attrs["dimensions"]["*"]
+    ]
     # aggregate categories
     for cat_to_agg in aggregate_cats:
         mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
-            aggregate_cats[cat_to_agg]["sources"])
+            aggregate_cats[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]
         # print(df_test)
 
 
@@ -237,10 +254,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
 
@@ -248,8 +265,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")

             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
             # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
@@ -257,7 +281,7 @@ if __name__ == "__main__":
 
 
             df_combine = df_combine.reset_index()
 
 
-            data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join='outer')
+            data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join="outer")
             data_if_2006 = data_if_2006.reset_index(drop=True)
         else:
             print(f"no data to aggregate category {cat_to_agg}")
@@ -268,7 +292,6 @@ if __name__ == "__main__":
     # convert back to IF to have units in the fixed format
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
 
 
-
     # ###
     # save data to IF and native format
     # ###
@@ -276,13 +299,22 @@ if __name__ == "__main__":
         output_folder.mkdir()

     # data in original categories
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
 
     encoding = {var: compression for var in data_all.data_vars}
-    data_all.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_all.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )
 
 
     # data in 2006 categories
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006
+    )
 
 
     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Morocco/__init__.py

@@ -0,0 +1,30 @@
+"""Read Morocco's BURs, NIRs, NCs
+
+Scripts and configurations to read Morocco's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'MAR'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=MAR
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 187 - 108
src/unfccc_ghg_data/unfccc_reader/Morocco/config_mar_bur3.py

@@ -1,57 +1,98 @@
+"""Config for Morocco's BUR3
+
+Partial configuration for camelot and data aggregation. PRIMAP2 conversion
+config and metadata are defined in the reading script.
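+
+``cat_mapping`` is applied to the interchange-format data as a plain column
+replacement. A minimal sketch with an abbreviated mapping and made-up data
+(see ``read_MAR_BUR3_from_pdf.py`` for the real call):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    cat_mapping = {"3.C": "3.C.7", "3.H": "3.C.3"}  # abbreviated example
+    data_if = pd.DataFrame(
+        {"category (IPCC2006)": ["3.C", "3.H", "1.A"], "2018": [1.0, 2.0, 3.0]}
+    )
+    # replace original category codes by IPCC2006_PRIMAP codes
+    data_if = data_if.replace({"category (IPCC2006)": cat_mapping})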
+
+"""
+
 # define which raw tables to combine
 table_defs = {
     2010: {
-        'Energy': [0, 1],
-        'Agriculture': [10],
-        'IPPU': [15, 16, 17],
-        'LULUCF': [30],
-        'Waste': [35],
+        "Energy": [0, 1],
+        "Agriculture": [10],
+        "IPPU": [15, 16, 17],
+        "LULUCF": [30],
+        "Waste": [35],
     },
     2012: {
-        'Energy': [2, 3],
-        'Agriculture': [11],
-        'IPPU': [18, 19, 20],
-        'LULUCF': [31],
-        'Waste': [36],
+        "Energy": [2, 3],
+        "Agriculture": [11],
+        "IPPU": [18, 19, 20],
+        "LULUCF": [31],
+        "Waste": [36],
     },
     2014: {
-        'Energy': [4, 5],
-        'Agriculture': [10],
-        'IPPU': [21, 22, 23],
-        'LULUCF': [32],
-        'Waste': [37],
+        "Energy": [4, 5],
+        "Agriculture": [10],
+        "IPPU": [21, 22, 23],
+        "LULUCF": [32],
+        "Waste": [37],
     },
     2016: {
-        'Energy': [6, 7],
-        'Agriculture': [10],
-        'IPPU': [24, 25, 26],
-        'LULUCF': [33],
-        'Waste': [38],
+        "Energy": [6, 7],
+        "Agriculture": [10],
+        "IPPU": [24, 25, 26],
+        "LULUCF": [33],
+        "Waste": [38],
     },
     2018: {
-        'Energy': [8, 9],
-        'Agriculture': [14],
-        'IPPU': [27, 28, 29],
-        'LULUCF': [34],
-        'Waste': [39],
+        "Energy": [8, 9],
+        "Agriculture": [14],
+        "IPPU": [27, 28, 29],
+        "LULUCF": [34],
+        "Waste": [39],
     },
 }

 header_defs = {
-    'Energy': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'Agriculture': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'Gg', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']], # units are wrong
+    "Energy": [
+        ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "COVNM", "SO2"],
+        ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
+    ],
+    "Agriculture": [
+        ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "COVNM", "SO2"],
+        ["", "Gg", "GgCO2eq", "GgCO2eq", "Gg", "Gg", "Gg", "Gg"],
+    ],  # units are wrong
     # in BUR pdf
-    'IPPU': [['Catégories', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'LULUCF': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'Waste': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
+    "IPPU": [
+        [
+            "Catégories",
+            "CO2",
+            "CH4",
+            "N2O",
+            "HFCs",
+            "PFCs",
+            "SF6",
+            "NOx",
+            "CO",
+            "COVNM",
+            "SO2",
+        ],
+        [
+            "",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "Gg",
+            "Gg",
+            "Gg",
+            "Gg",
+        ],
+    ],
+    "LULUCF": [
+        ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "COVNM", "SO2"],
+        ["", "GgCO2eq", "GgCO2eq", "GgCO2eq", "Gg", "Gg", "Gg", "Gg"],
+    ],
+    "Waste": [
+        ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "COVNM", "SO2"],
+        ["", "GgCO2eq", "GgCO2eq", "GgCO2eq", "Gg", "Gg", "Gg", "Gg"],
+    ],
 }

-remove_cats = ['3.A.4', '3.B', '3.B.4', '1.B.2.a', '1.B.2.b', '1.B.2.c']
+remove_cats = ["3.A.4", "3.B", "3.B.4", "1.B.2.a", "1.B.2.b", "1.B.2.c"]

 cat_mapping = {
     "1.B.2.a.4": "1.B.2.a.iii.4",
@@ -61,81 +102,119 @@ cat_mapping = {
     "1.B.2.b.4": "1.B.2.b.iii.4",
     "1.B.2.b.5": "1.B.2.b.iii.5",
     "1.B.2.b.6": "1.B.2.b.iii.6",
-    "1.B.2.c.1": "1.B.2.b.i", # simplification, split to oil and gas ("1.B.2.X.i")
-    "1.B.2.c.2": "1.B.2.b.ii", # simplification, split to oil and gas ("1.B.2.X.ii")
-    '1.A.2.g': '1.A.2.m', # other industry
-    '3.A': '3.A.1', # enteric fermentation
-    '3.A.1': '3.A.1.a', # cattle
-    '3.A.1.a': '3.A.1.a.i',
-    '3.A.1.b': '3.A.1.a.ii',
-    '3.A.2': '3.A.1.c',
-    '3.A.3': '3.A.1.h', # Swine
-    '3.A.4.a': '3.A.1.d', # goats
-    '3.A.4.b': '3.A.1.e', # camels
-    '3.A.4.c': '3.A.1.f', # horses
-    '3.A.4.d': '3.A.1.g', # Mules and asses
-    '3.A.4.e': '3.A.1.i', # poultry
-#    '3.B': '3.A.2', # Manure Management
-    '3.B.1': '3.A.2.a', # cattle
-    '3.B.1.a': '3.A.2.a.i',
-    '3.B.1.b': '3.A.2.a.ii',
-    '3.B.2': '3.A.2.c', # Sheep
-    '3.B.3': '3.A.2.h', # Swine
-    '3.B.4.a': '3.A.2.d', # Goats
-    '3.B.4.b': '3.A.2.e', # Camels
-    '3.B.4.c': '3.A.2.f', # Horses
-    '3.B.4.d': '3.A.2.g', # Mules and Asses
-    '3.B.4.e': '3.A.2.i', # Poultry
-    '3.B.5': '3.C.6', # indirect N2O from manure management
-    '3.C': '3.C.7', # rice
-    '3.D': 'M.3.C.45AG', # Agricultural soils
-    '3.D.a': '3.C.4', #direct N2O from agri soils
-    '3.D.a.1': '3.C.4.a', # inorganic fertilizers
-    '3.D.a.2': '3.C.4.b', # organic fertilizers
-    '3.D.a.3': '3.C.4.c', # urine and dung by grazing animals
-    '3.D.a.4': '3.C.4.d', # N in crop residues
-    '3.D.b': '3.C.5', # indirect N2O from managed soils
-    '3.D.b.1': '3.C.5.a', # Atmospheric deposition
-    '3.D.b.2': '3.C.5.b', # nitrogen leeching and runoff
-    '3.H': '3.C.3', # urea application
-    'LU.3.B.1': '3.B.1', # forest
-    'LU.3.B.2': '3.B.2', # cropland
-    'LU.3.B.3': '3.B.3', # grassland
-    'LU.3.B.4': '3.B.4', # wetland
-    'LU.3.B.5': '3.B.5', # Settlements
-    'LU.3.B.6': '3.B.6', # other land
+    "1.B.2.c.1": "1.B.2.b.i",  # simplification, split to oil and gas ("1.B.2.X.i")
+    "1.B.2.c.2": "1.B.2.b.ii",  # simplification, split to oil and gas ("1.B.2.X.ii")
+    "1.A.2.g": "1.A.2.m",  # other industry
+    "3.A": "3.A.1",  # enteric fermentation
+    "3.A.1": "3.A.1.a",  # cattle
+    "3.A.1.a": "3.A.1.a.i",
+    "3.A.1.b": "3.A.1.a.ii",
+    "3.A.2": "3.A.1.c",
+    "3.A.3": "3.A.1.h",  # Swine
+    "3.A.4.a": "3.A.1.d",  # goats
+    "3.A.4.b": "3.A.1.e",  # camels
+    "3.A.4.c": "3.A.1.f",  # horses
+    "3.A.4.d": "3.A.1.g",  # Mules and asses
+    "3.A.4.e": "3.A.1.i",  # poultry
+    #    '3.B': '3.A.2', # Manure Management
+    "3.B.1": "3.A.2.a",  # cattle
+    "3.B.1.a": "3.A.2.a.i",
+    "3.B.1.b": "3.A.2.a.ii",
+    "3.B.2": "3.A.2.c",  # Sheep
+    "3.B.3": "3.A.2.h",  # Swine
+    "3.B.4.a": "3.A.2.d",  # Goats
+    "3.B.4.b": "3.A.2.e",  # Camels
+    "3.B.4.c": "3.A.2.f",  # Horses
+    "3.B.4.d": "3.A.2.g",  # Mules and Asses
+    "3.B.4.e": "3.A.2.i",  # Poultry
+    "3.B.5": "3.C.6",  # indirect N2O from manure management
+    "3.C": "3.C.7",  # rice
+    "3.D": "M.3.C.45AG",  # Agricultural soils
+    "3.D.a": "3.C.4",  # direct N2O from agri soils
+    "3.D.a.1": "3.C.4.a",  # inorganic fertilizers
+    "3.D.a.2": "3.C.4.b",  # organic fertilizers
+    "3.D.a.3": "3.C.4.c",  # urine and dung by grazing animals
+    "3.D.a.4": "3.C.4.d",  # N in crop residues
+    "3.D.b": "3.C.5",  # indirect N2O from managed soils
+    "3.D.b.1": "3.C.5.a",  # Atmospheric deposition
+    "3.D.b.2": "3.C.5.b",  # nitrogen leeching and runoff
+    "3.H": "3.C.3",  # urea application
+    "LU.3.B.1": "3.B.1",  # forest
+    "LU.3.B.2": "3.B.2",  # cropland
+    "LU.3.B.3": "3.B.3",  # grassland
+    "LU.3.B.4": "3.B.4",  # wetland
+    "LU.3.B.5": "3.B.5",  # Settlements
+    "LU.3.B.6": "3.B.6",  # other land
 }

 aggregate_cats = {
-    '1.B.2.a.iii': {'sources': ['1.B.2.a.iii.4', '1.B.2.a.iii.5', '1.B.2.a.iii.6'],
-                    'name': 'All Other'},
-    '1.B.2.b.iii': {'sources': ['1.B.2.b.iii.2', '1.B.2.b.iii.4', '1.B.2.b.iii.5',
-                                '1.B.2.b.iii.6',],
-                    'name': 'All Other'},
-    '1.B.2.a': {'sources': ['1.B.2.a.iii'], 'name': 'Oil'},
-    '1.B.2.b': {'sources': ['1.B.2.b.i', '1.B.2.b.ii', '1.B.2.b.iii'],
-                'name': 'Natural Gas'},
-    '2.D':  {'sources': ['2.D.4'], 'name': 'Non-Energy Products from Fuels and Solvent Use'},
-    '2.F.1':  {'sources': ['2.F.1.a', '2.F.1.b'], 'name': 'Refrigeration and Air Conditioning'},
-    '2.F':  {'sources': ["2.F.1", "2.F.2", "2.F.3", "2.F.4", "2.F.5", "2.F.6"],
-             'name': 'Product uses as Substitutes for Ozone Depleting Substances'},
-    '2.H':  {'sources': ["2.H.1", "2.H.2", "2.H.3"], 'name': 'Other'},
-    '3.A.2': {'sources': ['3.A.2.a', '3.A.2.c', '3.A.2.d', '3.A.2.e', '3.A.2.f',
-                          '3.A.2.g', '3.A.2.h', '3.A.2.i'],
-              'name': 'Manure Management'},
-    '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-    '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.3', '3.B.4', '3.B.5', '3.B.6'], 'name': 'Land'},
-    '3.C': {'sources': ['3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-    'M.3.C.AG': {'sources': ['3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-    'M.AG': {'sources': ['3.A', 'M.3.C.AG'], 'name': 'Agriculture'},
-    '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
-    'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock emissions'},
-    '4': {'sources': ['4.A', '4.D'], 'name': 'Waste'},
-    '0': {'sources': ['1', '2', '3', '4']},
-    'M.0.EL': {'sources': ['1', '2', 'M.AG', '4']},
+    "1.B.2.a.iii": {
+        "sources": ["1.B.2.a.iii.4", "1.B.2.a.iii.5", "1.B.2.a.iii.6"],
+        "name": "All Other",
+    },
+    "1.B.2.b.iii": {
+        "sources": [
+            "1.B.2.b.iii.2",
+            "1.B.2.b.iii.4",
+            "1.B.2.b.iii.5",
+            "1.B.2.b.iii.6",
+        ],
+        "name": "All Other",
+    },
+    "1.B.2.a": {"sources": ["1.B.2.a.iii"], "name": "Oil"},
+    "1.B.2.b": {
+        "sources": ["1.B.2.b.i", "1.B.2.b.ii", "1.B.2.b.iii"],
+        "name": "Natural Gas",
+    },
+    "2.D": {
+        "sources": ["2.D.4"],
+        "name": "Non-Energy Products from Fuels and Solvent Use",
+    },
+    "2.F.1": {
+        "sources": ["2.F.1.a", "2.F.1.b"],
+        "name": "Refrigeration and Air Conditioning",
+    },
+    "2.F": {
+        "sources": ["2.F.1", "2.F.2", "2.F.3", "2.F.4", "2.F.5", "2.F.6"],
+        "name": "Product uses as Substitutes for Ozone Depleting Substances",
+    },
+    "2.H": {"sources": ["2.H.1", "2.H.2", "2.H.3"], "name": "Other"},
+    "3.A.2": {
+        "sources": [
+            "3.A.2.a",
+            "3.A.2.c",
+            "3.A.2.d",
+            "3.A.2.e",
+            "3.A.2.f",
+            "3.A.2.g",
+            "3.A.2.h",
+            "3.A.2.i",
+        ],
+        "name": "Manure Management",
+    },
+    "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+    "3.B": {
+        "sources": ["3.B.1", "3.B.2", "3.B.3", "3.B.4", "3.B.5", "3.B.6"],
+        "name": "Land",
+    },
+    "3.C": {
+        "sources": ["3.C.3", "3.C.4", "3.C.5", "3.C.6", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land",
+    },
+    "M.3.C.AG": {
+        "sources": ["3.C.3", "3.C.4", "3.C.5", "3.C.6", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+    },
+    "M.AG": {"sources": ["3.A", "M.3.C.AG"], "name": "Agriculture"},
+    "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
+    "M.AG.ELV": {
+        "sources": ["M.3.C.AG"],
+        "name": "Agriculture excluding livestock emissions",
+    },
+    "4": {"sources": ["4.A", "4.D"], "name": "Waste"},
+    "0": {"sources": ["1", "2", "3", "4"]},
+    "M.0.EL": {"sources": ["1", "2", "M.AG", "4"]},
 }

-zero_cats = ['1.B.2.a.i', '1.B.2.a.ii'] # venting and flaring with 0 for oil as
+zero_cats = ["1.B.2.a.i", "1.B.2.a.ii"]  # venting and flaring with 0 for oil as
 # all mapped to natural gas

+ 122 - 88
src/unfccc_ghg_data/unfccc_reader/Morocco/read_MAR_BUR3_from_pdf.py

@@ -1,13 +1,23 @@
-# this script reads data from Morocco's BUR3
-# Data is read from pdf
+"""
+Read Morocco's BUR3 from pdf
 
 
+This script reads data from Morocco's BUR3
+Data are read from pdf using camelot
+
+"""
 import copy

 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_mar_bur3 import (aggregate_cats, cat_mapping, header_defs, remove_cats,
-                              table_defs, zero_cats)
+from config_mar_bur3 import (
+    aggregate_cats,
+    cat_mapping,
+    header_defs,
+    remove_cats,
+    table_defs,
+    zero_cats,
+)
 from primap2.pm2io._data_reading import filter_data, matches_time_format

 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -16,11 +26,11 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Morocco' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Morocco'
-    output_filename = 'MAR_BUR3_2022_'
-    inventory_file = 'Morocco_BUR3_Fr.pdf'
-    gwp_to_use = 'AR4GWP100'
+    input_folder = downloaded_data_path / "UNFCCC" / "Morocco" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Morocco"
+    output_filename = "MAR_BUR3_2022_"
+    inventory_file = "Morocco_BUR3_Fr.pdf"
+    gwp_to_use = "AR4GWP100"
 
 
     # years to read
     years = [2010, 2012, 2014, 2016, 2018]
@@ -31,30 +41,28 @@ if __name__ == "__main__":
     # special header as category code and name in one column
     header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
 
 
-    index_cols = ['Catégories']
+    index_cols = ["Catégories"]
 
 
     # rows to remove
-    cats_remove = [
-        'Agriculture' # always empty
-    ]
+    cats_remove = ["Agriculture"]  # always empty
 
 
     # manual category codes
     cat_codes_manual = {
-        '1.A.2.e -Industries agro-alimentaires et du tabac': '1.A.2.e',
-        '1.A.2.f -Industries des minéraux non- métalliques': '1.A.2.f',
+        "1.A.2.e -Industries agro-alimentaires et du tabac": "1.A.2.e",
+        "1.A.2.f -Industries des minéraux non- métalliques": "1.A.2.f",
         #'Agriculture': 'M.AG',
         #'Agriculture': 'M.AG',
-        '2. PIUP': '2',
-        'UTCATF': 'M.LULUCF',
-        '3.B.1 Terres forestières': 'LU.3.B.1',
-        '3.B.2 Terres cultivées': 'LU.3.B.2',
-        '3.B.3 Prairies': 'LU.3.B.3',
-        '3.B.4 Terres humides': 'LU.3.B.4',
-        '3.B.5 Etablissements': 'LU.3.B.5',
-        '3.B.6 Autres terres': 'LU.3.B.6',
-        '1.B.1.a.i.1 -Exploitation minière': '1.A.1.a.i.1',
+        "2. PIUP": "2",
+        "UTCATF": "M.LULUCF",
+        "3.B.1 Terres forestières": "LU.3.B.1",
+        "3.B.2 Terres cultivées": "LU.3.B.2",
+        "3.B.3 Prairies": "LU.3.B.3",
+        "3.B.4 Terres humides": "LU.3.B.4",
+        "3.B.5 Etablissements": "LU.3.B.5",
+        "3.B.6 Autres terres": "LU.3.B.6",
+        "1.B.1.a.i.1 -Exploitation minière": "1.A.1.a.i.1",
     }
 
 
-    cat_code_regexp = r'(?P<code>^[a-zA-Z0-9\.]{1,14})\s-\s.*'
+    cat_code_regexp = r"(?P<code>^[a-zA-Z0-9\.]{1,14})\s-\s.*"
 
 
     coords_terminologies = {
         "area": "ISO3",
@@ -66,32 +74,29 @@ if __name__ == "__main__":
         "source": "MAR-GHG-inventory ",
         "provenance": "measured",
         "area": "MAR",
-        "scenario": "BUR3"
+        "scenario": "BUR3",
     }

     coords_value_mapping = {
         "unit": "PRIMAP1",
         "entity": {
-            'HFCs (AR4GWP100)': 'HFCS (AR4GWP100)',
-            'PFCs (AR4GWP100)': 'PFCS (AR4GWP100)',
-            'COVNM': 'NMVOC',
-        }
+            "HFCs (AR4GWP100)": "HFCS (AR4GWP100)",
+            "PFCs (AR4GWP100)": "PFCS (AR4GWP100)",
+            "COVNM": "NMVOC",
+        },
     }
 
 
+    coords_cols = {"category": "category", "entity": "entity", "unit": "unit"}
 
 
-    coords_cols = {
-        "category": "category",
-        "entity": "entity",
-        "unit": "unit"
-    }
-
-    #add_coords_cols = {
+    # add_coords_cols = {
     #    "orig_cat_name": ["orig_cat_name", "category"],
     #    "orig_cat_name": ["orig_cat_name", "category"],
-    #}
+    # }
 
 
     filter_remove = {
         "f1": {
-            "entity": ['Other halogenated gases without CO2 equivalent conversion factors (2)'],
+            "entity": [
+                "Other halogenated gases without CO2 equivalent conversion factors (2)"
+            ],
         },
     }
 
 
@@ -107,8 +112,9 @@ if __name__ == "__main__":
     ##### read the raw data from pdf #####
     tables = camelot.read_pdf(
         str(input_folder / inventory_file),
-        pages=','.join([str(page) for page in pages_to_read]),
-        flavor='lattice')
+        pages=",".join([str(page) for page in pages_to_read]),
+        flavor="lattice",
+    )
 
 
     ##### combine tables and convert to long format #####
     df_all = None
@@ -120,8 +126,9 @@ if __name__ == "__main__":
             df_first = tables[sector_tables[0]].df
             if len(sector_tables) > 1:
                 for table in sector_tables[1:]:
-                    df_this_table = pd.concat([df_first, tables[table].df], axis=0,
-                                              join='outer')
+                    df_this_table = pd.concat(
+                        [df_first, tables[table].df], axis=0, join="outer"
+                    )
             else:
                 df_this_table = df_first
 
 
@@ -130,11 +137,11 @@ if __name__ == "__main__":
             df_this_table.columns = header_defs[sector]

             # fix 2018 agri table
-            if (year == 2018) & (sector == "Agriculture"):
+            if (year == 2018) & (sector == "Agriculture"):  # noqa: PLR2004
                 last_shift_row = 25
-                df_temp = df_this_table.iloc[0: last_shift_row, 1:].copy()
-                df_this_table.iloc[0, 1:] = ''
-                df_this_table.iloc[1: last_shift_row + 1, 1:] = df_temp
+                df_temp = df_this_table.iloc[0:last_shift_row, 1:].copy()
+                df_this_table.iloc[0, 1:] = ""
+                df_this_table.iloc[1 : last_shift_row + 1, 1:] = df_temp
 
 
             # replace line breaks, long hyphens, double, and triple spaces in category names
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
@@ -144,14 +151,15 @@ if __name__ == "__main__":
 
 
             # set index and convert to long format
             df_this_table = df_this_table.set_index(index_cols)
-            df_this_table_long = pm2.pm2io.nir_convert_df_to_long(df_this_table, year,
-                                                                  header_long)
+            df_this_table_long = pm2.pm2io.nir_convert_df_to_long(
+                df_this_table, year, header_long
+            )
 
 
             # print(df_this_table_long.head())
             if df_all is None:
                 df_all = df_this_table_long
             else:
-                df_all = pd.concat([df_all, df_this_table_long], axis=0, join='outer')
+                df_all = pd.concat([df_all, df_this_table_long], axis=0, join="outer")
 
 
     df_all = df_all.reset_index(drop=True)
 
 
@@ -166,24 +174,32 @@ if __name__ == "__main__":
     # replace cat names by codes in col "category"
     # first the manual replacements
     df_all["category"] = df_all["category"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-        return m.group('code')
-    df_all["category"] = df_all["category"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_all["category"] = df_all["category"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
     df_all = df_all.reset_index(drop=True)

     # prepare numbers for pd.to_numeric
-    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(' ', '')
-    def repl(m):
-        return m.group('part1') + '.' + m.group('part2')
-    df_all.loc[:, 'data'] = df_all.loc[:, 'data'].str.replace(
-        '(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-    df_all['data'][df_all['data'].isnull()] = 'NaN'
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(" ", "")
+
+    def repl(m):  # noqa: D103
+        return m.group("part1") + "." + m.group("part2")
+
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(
+        "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+    )
+    df_all["data"][df_all["data"].isna()] = "NaN"
 
 
     # add GWP information to entity
     for entity in df_all["entity"].unique():
-        df_all["entity"][(df_all["entity"] == entity) & (
-                    df_all["unit"] == "GgCO2eq")] = f"{entity} ({gwp_to_use})"
+        df_all["entity"][
+            (df_all["entity"] == entity) & (df_all["unit"] == "GgCO2eq")
+        ] = f"{entity} ({gwp_to_use})"
 
 
     # drop "original_cat_name" as it has non-unique values per category
     df_all = df_all.drop(columns="orig_cat_name")
@@ -196,7 +212,8 @@ if __name__ == "__main__":
         coords_value_mapping=coords_value_mapping,
         filter_remove=filter_remove,
         meta_data=meta_data,
-        convert_str=True
+        convert_str=True,
+        time_format="%Y",
     )

     # make sure all col headers are str
@@ -205,7 +222,9 @@ if __name__ == "__main__":
     # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
 
 
-    entities_to_convert = ['CO2'] #['N2O', 'SF6', 'CO2', 'CH4'] # CO2 is not converted on
+    entities_to_convert = [
+        "CO2"
+    ]  # ['N2O', 'SF6', 'CO2', 'CH4'] # CO2 is not converted on
     # conversion to IF as data with and without GWP exists. needs to be fixed in primap2
     entities_to_convert = [f"{entity} (AR4GWP100)" for entity in entities_to_convert]
 
 
@@ -230,38 +249,42 @@ if __name__ == "__main__":
     data_if_2006.attrs = copy.deepcopy(data_if.attrs)

     filter_remove_cats = {
-        "cat": {
-            f"category ({coords_terminologies['category']})":
-        remove_cats
-        },
+        "cat": {f"category ({coords_terminologies['category']})": remove_cats},
     }

     filter_data(data_if_2006, filter_remove=filter_remove_cats)

     # map categories
     data_if_2006 = data_if_2006.replace(
-        {f"category ({coords_terminologies['category']})": cat_mapping})
+        {f"category ({coords_terminologies['category']})": cat_mapping}
+    )
     data_if_2006[f"category ({coords_terminologies['category']})"].unique()

     # rename the category col
-    data_if_2006.rename(columns={
-        f"category ({coords_terminologies['category']})": 'category (IPCC2006_PRIMAP)'},
-                        inplace=True)
-    data_if_2006.attrs['attrs']['cat'] = 'category (IPCC2006_PRIMAP)'
-    data_if_2006.attrs['dimensions']['*'] = [
-        'category (IPCC2006_PRIMAP)' if item == f"category ({coords_terminologies['category']})"
-        else item for item in data_if_2006.attrs['dimensions']['*']]
+    data_if_2006 = data_if_2006.rename(
+        columns={
+            f"category ({coords_terminologies['category']})": "category (IPCC2006_PRIMAP)"
+        }
+    )
+    data_if_2006.attrs["attrs"]["cat"] = "category (IPCC2006_PRIMAP)"
+    data_if_2006.attrs["dimensions"]["*"] = [
+        "category (IPCC2006_PRIMAP)"
+        if item == f"category ({coords_terminologies['category']})"
+        else item
+        for item in data_if_2006.attrs["dimensions"]["*"]
+    ]
     # aggregate categories
-    time_format = '%Y'
+    time_format = "%Y"
     time_columns = [
         col
-        for col in data_if_2006.columns.values
+        for col in data_if_2006.columns.to_numpy()
         if matches_time_format(col, time_format)
     ]
 
 
     for cat_to_agg in aggregate_cats:
         mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
-            aggregate_cats[cat_to_agg]["sources"])
+            aggregate_cats[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]
         # print(df_test)
 
 
@@ -273,8 +296,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
             # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
 
 
             df_combine = df_combine.reset_index()
             df_combine = df_combine.reset_index()
 
 
-            data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join='outer')
+            data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join="outer")
             data_if_2006 = data_if_2006.reset_index(drop=True)
             data_if_2006 = data_if_2006.reset_index(drop=True)
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
     for cat in zero_cats:
     for cat in zero_cats:
         entities = data_if_2006["entity"].unique()
         entities = data_if_2006["entity"].unique()
-        data_zero = data_if_2006[data_if_2006["category (IPCC2006_PRIMAP)"]=="1"].copy(
-            deep=True)
+        data_zero = data_if_2006[
+            data_if_2006["category (IPCC2006_PRIMAP)"] == "1"
+        ].copy(deep=True)
         data_zero["category (IPCC2006_PRIMAP)"] = cat
         for col in time_columns:
             data_zero[col] = 0
@@ -303,7 +334,6 @@ if __name__ == "__main__":
     # convert back to IF to have units in the fixed format
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
 
 
-
     # ###
     # save data to IF and native format
     # ###
@@ -312,17 +342,21 @@ if __name__ == "__main__":
 
 
     # data in original categories
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"]), data_if)
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
         output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
-        encoding=encoding)
+        encoding=encoding,
+    )
 
 
     # data in 2006 categories
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006)
+        output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006
+    )
 
 
     encoding = {var: compression for var in data_pm2_2006.data_vars}
     data_pm2_2006.pr.to_netcdf(
-        output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding)
+        output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Nigeria/__init__.py

@@ -0,0 +1,30 @@
+"""Read Nigeria's BURs, NIRs, NCs
+
+Scripts and configurations to read Nigeria's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'NGA'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=NGA
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 294 - 272
src/unfccc_ghg_data/unfccc_reader/Nigeria/config_nga_bur2.py

@@ -1,274 +1,280 @@
-gwp_to_use = 'AR5GWP100'
+"""Config for Nigeria's BUR2
+
+Full configuration including PRIMAP2 conversion config and metadata
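+
+Each entry in ``tables_trends`` bundles the camelot extraction parameters for
+one pdf table. A sketch of how such an entry could be passed to camelot; the
+file name and the ``stream`` flavor are assumptions here, not taken from the
+reading script:
+
+.. code-block:: python
+
+    import camelot
+
+    table_def = {
+        "page": "70",
+        "area": ["177,430,450,142"],
+        "cols": ["208,260,311,355,406"],
+    }
+    tables = camelot.read_pdf(
+        "NGA_BUR2.pdf",  # hypothetical path to the downloaded BUR pdf
+        pages=table_def["page"],
+        flavor="stream",
+        table_areas=table_def["area"],
+        columns=table_def["cols"],
+    )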
+
+"""
+
+gwp_to_use = "AR5GWP100"
 
 
 tables_trends = {
-    '70': { # GHG by main sector
-        'page': '70',
-        'area': ['177,430,450,142'],
-        'cols': ['208,260,311,355,406'],
-        'coords_defaults': {
-            'unit': 'GgCO2eq',
-        },
-        'coords_cols': {
+    "70": {  # GHG by main sector
+        "page": "70",
+        "area": ["177,430,450,142"],
+        "cols": ["208,260,311,355,406"],
+        "coords_defaults": {
+            "unit": "GgCO2eq",
+        },
+        "coords_cols": {
             "category": "Year",
             "entity": "entity",
         },
-        'copy_cols': {
+        "copy_cols": {
             # to: from
-            'entity': 'Year',
+            "entity": "Year",
         },
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU': '3',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU": "3",
+                "Waste": "4",
             },
-            'entity': {
-                'Total emissions': f'KYOTOGHG emissions ({gwp_to_use})',
-                'Energy': f'KYOTOGHG ({gwp_to_use})',
-                'IPPU': f'KYOTOGHG ({gwp_to_use})',
-                'AFOLU': f'KYOTOGHG emissions ({gwp_to_use})',
-                'Waste': f'KYOTOGHG ({gwp_to_use})',
+            "entity": {
+                "Total emissions": f"KYOTOGHG emissions ({gwp_to_use})",
+                "Energy": f"KYOTOGHG ({gwp_to_use})",
+                "IPPU": f"KYOTOGHG ({gwp_to_use})",
+                "AFOLU": f"KYOTOGHG emissions ({gwp_to_use})",
+                "Waste": f"KYOTOGHG ({gwp_to_use})",
             },
         },
-        'label_rows': [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '71': { # main gases by sector
-    'page': '71',
-        'area': ['82,760,509,454'],
-        'cols': ['124,186,249,326,388,454'],
-        'coords_defaults': {
-            'category': '0',
-            'unit': 'GgCO2eq',
-        },
-        'coords_cols': {
+    "71": {  # main gases by sector
+        "page": "71",
+        "area": ["82,760,509,454"],
+        "cols": ["124,186,249,326,388,454"],
+        "coords_defaults": {
+            "category": "0",
+            "unit": "GgCO2eq",
+        },
+        "coords_cols": {
             "entity": "Year",
             "entity": "Year",
         },
         },
-        'remove_cols': [],
-        'coords_value_mapping': {
+        "remove_cols": [],
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'entity': {
-                'Total GHG emissions (CO₂-eq)': f'KYOTOGHG emissions ({gwp_to_use})',
-                'Removals (CO₂) (CO₂-eq)': 'CO2 removals',
-                'Net emissions (CO₂-eq)': f'KYOTOGHG ({gwp_to_use})',
-                'CO₂ (Gg)': 'CO2 emissions',
-                'CH₄ (CO₂-eq)': f'CH4 ({gwp_to_use})',
-                'N₂O (CO₂-eq)': f'N2O ({gwp_to_use})',
+            "entity": {
+                "Total GHG emissions (CO₂-eq)": f"KYOTOGHG emissions ({gwp_to_use})",
+                "Removals (CO₂) (CO₂-eq)": "CO2 removals",
+                "Net emissions (CO₂-eq)": f"KYOTOGHG ({gwp_to_use})",
+                "CO₂ (Gg)": "CO2 emissions",
+                "CH₄ (CO₂-eq)": f"CH4 ({gwp_to_use})",
+                "N₂O (CO₂-eq)": f"N2O ({gwp_to_use})",
             },
         },
-        'label_rows':  [0, 1, 2, 3, 4],
+        "label_rows": [0, 1, 2, 3, 4],
     },
-    '72_1': { # CO2 by main sector
-    'page': '72',
-        'area': ['122,760,496,472'],
-        'cols': ['159,212,265,311,355,406,456'],
-        'coords_defaults': {
+    "72_1": {  # CO2 by main sector
+        "page": "72",
+        "area": ["122,760,496,472"],
+        "cols": ["159,212,265,311,355,406,456"],
+        "coords_defaults": {
             #'entity': 'CO2',
-            'unit': 'Gg',
+            "unit": "Gg",
         },
-        'coords_cols': {
+        "coords_cols": {
             "category": "Year",
             "category": "Year",
-            'entity': 'entity',
+            "entity": "entity",
         },
-        'remove_cols': ['Total emissions'],
-        'copy_cols': {
+        "remove_cols": ["Total emissions"],
+        "copy_cols": {
             # to: from
-            'entity': 'Year',
+            "entity": "Year",
         },
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'category': {
-                'Total net emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU - emissions': '3',
-                'AFOLU - removals': '3',
-                'Waste': '4',
+            "category": {
+                "Total net emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU - emissions": "3",
+                "AFOLU - removals": "3",
+                "Waste": "4",
             },
-            'entity': {
-                'Total net emissions': 'CO2',
-                'Energy': 'CO2',
-                'IPPU': 'CO2',
-                'AFOLU - emissions': 'CO2 emissions',
-                'AFOLU - removals': 'CO2 removals',
-                'Waste': 'CO2',
+            "entity": {
+                "Total net emissions": "CO2",
+                "Energy": "CO2",
+                "IPPU": "CO2",
+                "AFOLU - emissions": "CO2 emissions",
+                "AFOLU - removals": "CO2 removals",
+                "Waste": "CO2",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '72_2': { # CH4 by sector
-    'page': '72',
-        'area': ['133,333,483,41'],
-        'cols': ['172,230,280,333,384,439'],
-        'coords_defaults': {
-            'entity': 'CH4',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "72_2": {  # CH4 by sector
+        "page": "72",
+        "area": ["133,333,483,41"],
+        "cols": ["172,230,280,333,384,439"],
+        "coords_defaults": {
+            "entity": "CH4",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
             "category": "Year",
         },
         },
-        'remove_cols': ['Total (Gg CO₂-eq)'],
-        'coords_value_mapping': {
+        "remove_cols": ["Total (Gg CO₂-eq)"],
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'category': {
-                'Total': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU - emissions': '3',
-                'Waste': '4',
+            "category": {
+                "Total": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU - emissions": "3",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '73': { # N2O by sector
-    'page': '73',
-        'area': ['155,666,643,364'],
-        'cols': ['194,265,309,366,419'],
-        'coords_defaults': {
-            'entity': 'N2O',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "73": {  # N2O by sector
+        "page": "73",
+        "area": ["155,666,643,364"],
+        "cols": ["194,265,309,366,419"],
+        "coords_defaults": {
+            "entity": "N2O",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
             "category": "Year",
         },
         },
-        'remove_cols': ['Total emissions (Gg CO₂-eq)'],
-        'coords_value_mapping': {
+        "remove_cols": ["Total emissions (Gg CO₂-eq)"],
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'category': {
-                'Total': '0',
-                'Energy': '1',
-                'AFOLU': '3',
-                'Waste': '4',
+            "category": {
+                "Total": "0",
+                "Energy": "1",
+                "AFOLU": "3",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '74': { # NOx by sector
-    'page': '74',
-        'area': ['148,457,467,166'],
-        'cols': ['190,254,304,359,421'],
-        'coords_defaults': {
-            'entity': 'NOX',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "74": {  # NOx by sector
+        "page": "74",
+        "area": ["148,457,467,166"],
+        "cols": ["190,254,304,359,421"],
+        "coords_defaults": {
+            "entity": "NOX",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
             "category": "Year",
         },
         },
         #'remove_cols': [],
         #'remove_cols': [],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU': '3',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU": "3",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '75': { # CO by sector
-    'page': '75',
-        'area': ['161,763,456,472'],
-        'cols': ['199,256,307,359,410'],
-        'coords_defaults': {
-            'entity': 'CO',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "75": {  # CO by sector
+        "page": "75",
+        "area": ["161,763,456,472"],
+        "cols": ["199,256,307,359,410"],
+        "coords_defaults": {
+            "entity": "CO",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
             "category": "Year",
         },
         },
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU': '3',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU": "3",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '75_2': { # NMVOC by sector
-    'page': '75',
-        'area': ['177,325,441,50'],
-        'cols': ['219,287,340,395'],
-        'coords_defaults': {
-            'entity': 'NMVOC',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "75_2": {  # NMVOC by sector
+        "page": "75",
+        "area": ["177,325,441,50"],
+        "cols": ["219,287,340,395"],
+        "coords_defaults": {
+            "entity": "NMVOC",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
             "category": "Year",
         },
         },
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '76_1': { # NMVOC by sector
-    'page': '76',
-        'area': ['175,782,448,675'],
-        'cols': ['216,282,340,390'],
-        'coords_defaults': {
-            'entity': 'NMVOC',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "76_1": {  # NMVOC by sector
+        "page": "76",
+        "area": ["175,782,448,675"],
+        "cols": ["216,282,340,390"],
+        "coords_defaults": {
+            "entity": "NMVOC",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
             "category": "Year",
         },
         },
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '76_2': { # SO2 by sector
-    'page': '76',
-        'area': ['197,562,421,226'],
-        'cols': ['243,331,381'],
-        'coords_defaults': {
-            'entity': 'SO2',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "76_2": {  # SO2 by sector
+        "page": "76",
+        "area": ["197,562,421,226"],
+        "cols": ["243,331,381"],
+        "coords_defaults": {
+            "entity": "SO2",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
             "category": "Year",
         },
         },
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0],
+        "label_rows": [0],
     },
 }
 
 pages_inventory = {
-    '78': 1,
-    '79': 0,
-    '80': 0,
-    '81': 0,
-    '82': 0,
+    "78": 1,
+    "79": 0,
+    "80": 0,
+    "81": 0,
+    "82": 0,
 }
 
 year_inventory = 2017
@@ -279,8 +285,8 @@ unit_row = 0
 ###
 index_cols = "Categories"
 units_inv = {
-    'Emissions (Gg)': 'Gg',
-    'Emissions CO2 Equivalents (Gg)': 'GgCO2eq',
+    "Emissions (Gg)": "Gg",
+    "Emissions CO2 Equivalents (Gg)": "GgCO2eq",
 }
 # special header as category code and name in one column
 header_long = ["category", "entity", "unit", "time", "data"]
@@ -288,11 +294,11 @@ header_long = ["category", "entity", "unit", "time", "data"]
 
 # manual category codes
 cat_codes_manual = {
-    'Total National Emissions and Removals': '0',
-    'International Bunkers': 'M.BK',
+    "Total National Emissions and Removals": "0",
+    "International Bunkers": "M.BK",
 }
 
-cat_code_regexp = r'(?P<code>^[a-zA-Z0-9\.]{1,9})\s.*'
+cat_code_regexp = r"(?P<code>^[a-zA-Z0-9\.]{1,9})\s.*"
 
 coords_cols = {
     "category": "category",
@@ -321,29 +327,24 @@ coords_value_mapping = {
     "unit": "PRIMAP1",
     "unit": "PRIMAP1",
     "category": "PRIMAP1",
     "category": "PRIMAP1",
     "entity": {
     "entity": {
-        'Net CO2 (1)(2)': 'CO2',
-        'CH4': "CH4",
-        'N2O': "N2O",
-        'HFCs': f"HFCS ({gwp_to_use})",
-        'PFCs': f"PFCS ({gwp_to_use})",
-        'SF6': f"SF6 ({gwp_to_use})",
+        "Net CO2 (1)(2)": "CO2",
+        "CH4": "CH4",
+        "N2O": "N2O",
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "SF6": f"SF6 ({gwp_to_use})",
         #'NOx': 'NOX',
-        'CO': 'CO', # no mapping, just added for completeness here
-        'NMVOCs': 'NMVOC',
-        'SO2': 'SO2', # no mapping, just added for completeness here
-        'Other halogenated gases with CO2 eq conversion factors (3)':
-            f"UnspMixOfHFCs ({gwp_to_use})",
+        "CO": "CO",  # no mapping, just added for completeness here
+        "NMVOCs": "NMVOC",
+        "SO2": "SO2",  # no mapping, just added for completeness here
+        "Other halogenated gases with CO2 eq conversion factors (3)": f"UnspMixOfHFCs ({gwp_to_use})",
     },
 }
 
 
 filter_remove = {
-    'f1': {
-        'entity': ['Other halogenated gases without CO2 eq conversion factors (4)']
-    },
-    'f2': {
-        'category': 'Memo'
-    },
+    "f1": {"entity": ["Other halogenated gases without CO2 eq conversion factors (4)"]},
+    "f2": {"category": "Memo"},
 }
 
 filter_keep = {}
@@ -353,73 +354,90 @@ meta_data = {
     "rights": "",
     "rights": "",
     "contact": "mail@johannes-guestchow.de",
     "contact": "mail@johannes-guestchow.de",
     "title": "Nigeria. Second Biennial Update Report (BUR2) to the United Nations "
     "title": "Nigeria. Second Biennial Update Report (BUR2) to the United Nations "
-             "Framework Convention on Climate Change",
+    "Framework Convention on Climate Change",
     "comment": "Read fom pdf by Johannes Gütschow",
     "comment": "Read fom pdf by Johannes Gütschow",
     "institution": "UNFCCC",
     "institution": "UNFCCC",
 }
 }
 
 # convert to mass units where possible
-entities_to_convert_to_mass = [
-    'CH4', 'N2O', 'SF6'
-]
+entities_to_convert_to_mass = ["CH4", "N2O", "SF6"]
 
-# CO2 equivalents don't make sense for these substances, so unit has to be Gg instead of Gg CO2 equivalents as indicated in the table
-entities_to_fix_unit = [
-    'NOx', 'CO', 'NMVOCs', 'SO2'
-]
+# CO2 equivalents don't make sense for these substances, so unit has to be Gg instead
+# of Gg CO2 equivalents as indicated in the table
+entities_to_fix_unit = ["NOx", "CO", "NMVOCs", "SO2"]
 
 ### processing
 
 processing_info_step1 = {
-    'aggregate_cats': {
-        '2.F': {'sources': ['2.F.2', '2.F.6'], # all 0, but for completeness
-              'name': 'Product uses as Substitutes for Ozone Depleting Substances'},
-        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G'],
-              'name': 'IPPU'}, # for HFCs, PFCs, SO2, SF6, N2O (all 0)
+    "aggregate_cats": {
+        "2.F": {
+            "sources": ["2.F.2", "2.F.6"],  # all 0, but for completeness
+            "name": "Product uses as Substitutes for Ozone Depleting Substances",
+        },
+        "2": {
+            "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G"],
+            "name": "IPPU",
+        },  # for HFCs, PFCs, SO2, SF6, N2O (all 0)
     },
 }
 
-processing_info_step2 =  {
-    'aggregate_cats': {
-        'M.AG.ELV': {'sources': ['3.C'], 'name': 'Agriculture excluding livestock emissions'},
-        'M.AG': {'sources': ['M.AG.ELV', '3.A'], 'name': 'Agriculture'},
-        'M.LULUCF': {'sources': ['3.B', '3.D'],
-                     'name': 'Land Use, Land Use Change, and Forestry'},
-        'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'], 'name': 'National Total Excluding LULUCF'},
-        '0': {'sources': ['1', '2', '3', '4', '5'], 'name': 'National Total'},
+processing_info_step2 = {
+    "aggregate_cats": {
+        "M.AG.ELV": {
+            "sources": ["3.C"],
+            "name": "Agriculture excluding livestock emissions",
+        },
+        "M.AG": {"sources": ["M.AG.ELV", "3.A"], "name": "Agriculture"},
+        "M.LULUCF": {
+            "sources": ["3.B", "3.D"],
+            "name": "Land Use, Land Use Change, and Forestry",
+        },
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4", "5"],
+            "name": "National Total Excluding LULUCF",
+        },
+        "0": {"sources": ["1", "2", "3", "4", "5"], "name": "National Total"},
     },
-    'downscale': {
-        'sectors': {
-            '1': {
-                'basket': '1',
-                'basket_contents': ['1.A', '1.B', '1.C'],
-                'entities': ['CO2', 'N2O', 'CH4'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+    "downscale": {
+        "sectors": {
+            "1": {
+                "basket": "1",
+                "basket_contents": ["1.A", "1.B", "1.C"],
+                "entities": ["CO2", "N2O", "CH4"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '1.A': {
-                'basket': '1.A',
-                'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4'],
-                'entities': ['CO2', 'N2O', 'CH4'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "1.A": {
+                "basket": "1.A",
+                "basket_contents": ["1.A.1", "1.A.2", "1.A.3", "1.A.4"],
+                "entities": ["CO2", "N2O", "CH4"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '1.B': {
-                'basket': '1.B',
-                'basket_contents': ['1.B.1', '1.B.2', '1.B.3'],
-                'entities': ['CO2', 'N2O', 'CH4'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "1.B": {
+                "basket": "1.B",
+                "basket_contents": ["1.B.1", "1.B.2", "1.B.3"],
+                "entities": ["CO2", "N2O", "CH4"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            'IPPU': {
-                'basket': '2',
-                'basket_contents': ['2.A', '2.B', '2.C', '2.D', '2.E',
-                                    '2.F', '2.G', '2.H'],
-                'entities': ['CO2', 'N2O', 'CH4'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "IPPU": {
+                "basket": "2",
+                "basket_contents": [
+                    "2.A",
+                    "2.B",
+                    "2.C",
+                    "2.D",
+                    "2.E",
+                    "2.F",
+                    "2.G",
+                    "2.H",
+                ],
+                "entities": ["CO2", "N2O", "CH4"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '3': {
-                'basket': '3',
-                'basket_contents': ['3.A', '3.B', '3.C', '3.D'],
-                'entities': ['CO2', 'CH4', 'N2O'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "3": {
+                "basket": "3",
+                "basket_contents": ["3.A", "3.B", "3.C", "3.D"],
+                "entities": ["CO2", "CH4", "N2O"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
             # '3A': {
             #     'basket': '3.A',
@@ -442,17 +460,21 @@ processing_info_step2 =  {
             # },
         },
     },
-    'remove_ts': {
-        'fgases': { # unnecessary and complicates aggregation for
+    "remove_ts": {
+        "fgases": {  # unnecessary and complicates aggregation for
             # other gases
-            'category': ['5'],
-            'entities': [f'HFCS ({gwp_to_use})', f'PFCS ({gwp_to_use})', 'SF6',
-                         f'UnspMixOfHFCs ({gwp_to_use})'],
+            "category": ["5"],
+            "entities": [
+                f"HFCS ({gwp_to_use})",
+                f"PFCS ({gwp_to_use})",
+                "SF6",
+                f"UnspMixOfHFCs ({gwp_to_use})",
+            ],
         },
     },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR4GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS", "UnspMixOfHFCs"],
-        'source_GWP': gwp_to_use,
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR4GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS", "UnspMixOfHFCs"],
+        "source_GWP": gwp_to_use,
     },
 }

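The area and cols strings in the table configurations above are camelot stream-mode geometry: table_areas takes "x1,y1,x2,y2" in PDF points (top-left to bottom-right corner), and columns lists the x coordinates of the column separators. A minimal sketch of how one trend-table entry is consumed (the reader script below does the same in a loop; the pdf filename here is hypothetical):

.. code-block:: python

    import camelot

    from config_nga_bur2 import tables_trends  # as the reader script imports it

    cfg = tables_trends["70"]
    tables = camelot.read_pdf(
        "NIGERIA_BUR2.pdf",  # hypothetical filename
        pages=cfg["page"],
        table_areas=cfg["area"],  # "x1,y1,x2,y2" in PDF points
        columns=cfg["cols"],  # x coordinates separating the columns
        flavor="stream",
        split_text=True,
    )
    print(tables[0].df)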
+ 137 - 103
src/unfccc_ghg_data/unfccc_reader/Nigeria/read_NGA_BUR2_from_pdf.py

@@ -1,5 +1,10 @@
-# this script reads data from Nigeria's BUR2
-# Data is read from the pdf file
+"""
+Read Nigeria's BUR2 from pdf
+
+This script reads data from Nigeria's BUR2.
+Data are read from the pdf using camelot.
+
+"""
 
 import locale
 from copy import deepcopy
@@ -9,32 +14,32 @@ import numpy as np
 import pandas as pd
 import primap2 as pm2
 import xarray as xr
-from .config_nga_bur2 import (
-   cat_code_regexp,
-   cat_codes_manual,
-   coords_cols,
-   coords_defaults,
-   coords_terminologies,
-   coords_value_mapping,  #, add_coords_cols
-   entity_row,
-   filter_remove,
-   header_long,
-   index_cols,
-   meta_data,
-   pages_inventory,
-   processing_info_step1,
-   processing_info_step2,
-   tables_trends,
-   unit_row,
-   units_inv,
-   year_inventory,
+from config_nga_bur2 import (
+    cat_code_regexp,
+    cat_codes_manual,
+    coords_cols,
+    coords_defaults,
+    coords_terminologies,
+    coords_value_mapping,  # , add_coords_cols
+    entity_row,
+    filter_remove,
+    header_long,
+    index_cols,
+    meta_data,
+    pages_inventory,
+    processing_info_step1,
+    processing_info_step2,
+    tables_trends,
+    unit_row,
+    units_inv,
+    year_inventory,
 )
 
 from unfccc_ghg_data.helper import (
-   downloaded_data_path,
-   extracted_data_path,
-   gas_baskets,
-   process_data_for_country,
+    downloaded_data_path,
+    extracted_data_path,
+    gas_baskets,
+    process_data_for_country,
 )
 
 if __name__ == "__main__":
@@ -42,61 +47,74 @@ if __name__ == "__main__":
     # configuration
     # ###
     # define locale to use for str to float conversion
-    locale_to_use = 'en_NG.UTF-8'
+    locale_to_use = "en_NG.UTF-8"
     locale.setlocale(locale.LC_NUMERIC, locale_to_use)
 
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Nigeria' / 'BUR2'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Nigeria'
+    input_folder = downloaded_data_path / "UNFCCC" / "Nigeria" / "BUR2"
+    output_folder = extracted_data_path / "UNFCCC" / "Nigeria"
     if not output_folder.exists():
-       output_folder.mkdir()
+        output_folder.mkdir()
 
-    output_filename = 'NGA_BUR2_2021_'
+    output_filename = "NGA_BUR2_2021_"
     compression = dict(zlib=True, complevel=9)
-    inventory_file = 'NIGERIA_BUR_2_-_Second_Biennial_Update_Report_%28BUR2%29.pdf'
+    inventory_file = "NIGERIA_BUR_2_-_Second_Biennial_Update_Report_%28BUR2%29.pdf"
 
     ## read 2017 inventory
     df_inventory = None
     for page in pages_inventory.keys():
-        tables = camelot.read_pdf(str(input_folder / inventory_file), pages=str(page),
-                                  flavor='lattice')
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file), pages=str(page), flavor="lattice"
+        )
         df_this_table = tables[pages_inventory[page]].df
         # replace line breaks, double, and triple spaces in category names
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
         # replace line breaks in units and entities
-        df_this_table.iloc[entity_row] = df_this_table.iloc[entity_row].str.replace('\n',
-                                                                                    '')
-        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].str.replace('\n', '')
+        df_this_table.iloc[entity_row] = df_this_table.iloc[entity_row].str.replace(
+            "\n", ""
+        )
+        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].str.replace(
+            "\n", ""
+        )
 
         # fillna in unit row
-        df_this_table.iloc[unit_row][df_this_table.iloc[unit_row]==""] = np.nan
-        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].fillna(
-            method='ffill')
-        df_this_table = pm2.pm2io.nir_add_unit_information(df_this_table, unit_row=unit_row,
-                                                           entity_row=entity_row,
-                                                           regexp_entity=".*",
-                                                           manual_repl_unit=units_inv,
-                                                           default_unit="")
+        df_this_table.iloc[unit_row][df_this_table.iloc[unit_row] == ""] = np.nan
+        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].ffill()
+        df_this_table = pm2.pm2io.nir_add_unit_information(
+            df_this_table,
+            unit_row=unit_row,
+            entity_row=entity_row,
+            regexp_entity=".*",
+            manual_repl_unit=units_inv,
+            default_unit="",
+        )
 
         # set index and convert to long format
         df_this_table = df_this_table.set_index(index_cols)
-        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(df_this_table, year_inventory,
-                                                              header_long)
+        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(
+            df_this_table, year_inventory, header_long
+        )
 
         # combine with tables for other sectors (merge not append)
         if df_inventory is None:
             df_inventory = df_this_table_long
         else:
-            df_inventory = pd.concat([df_inventory, df_this_table_long], axis=0, join='outer')
+            df_inventory = pd.concat(
+                [df_inventory, df_this_table_long], axis=0, join="outer"
+            )
 
     # replace cat names by codes in col "category"
     # first the manual replacements
     df_inventory["category"] = df_inventory["category"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-       return m.group('code')
-    df_inventory["category"] = df_inventory["category"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_inventory["category"] = df_inventory["category"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
     df_inventory = df_inventory.reset_index(drop=True)
 
     # ###
@@ -105,15 +123,15 @@ if __name__ == "__main__":
     data_inv_if = pm2.pm2io.convert_long_dataframe_if(
         df_inventory,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
         filter_remove=filter_remove,
         meta_data=meta_data,
         convert_str=True,
-        time_format='%Y',
-        )
+        time_format="%Y",
+    )
 
     data_inv_pm2 = pm2.pm2io.from_interchange_format(data_inv_if)
 
@@ -122,19 +140,21 @@ if __name__ == "__main__":
     for table in tables_trends.keys():
         print(table)
         current_table = deepcopy(tables_trends[table])
-        tables = camelot.read_pdf(str(input_folder / inventory_file),
-                                  pages=current_table["page"],
-                                  table_areas=current_table["area"],
-                                  columns=current_table["cols"],
-                                  flavor='stream',
-                                  split_text=True)
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file),
+            pages=current_table["page"],
+            table_areas=current_table["area"],
+            columns=current_table["cols"],
+            flavor="stream",
+            split_text=True,
+        )
         df_this_table = tables[0].df
 
         # merge rows for entity and unit
         rows_to_merge = df_this_table.iloc[current_table["label_rows"]]
         indices_to_merge = rows_to_merge.index
         # join the three rows
-        new_row = rows_to_merge.agg(' '.join)
+        new_row = rows_to_merge.agg(" ".join)
         df_this_table.loc[indices_to_merge[0]] = new_row
         df_this_table = df_this_table.drop(indices_to_merge)
         new_row = new_row.str.replace("  ", " ")
@@ -144,7 +164,7 @@ if __name__ == "__main__":
         df_this_table.columns = new_row
 
         # remove columns not needed
-        if 'remove_cols' in current_table.keys():
+        if "remove_cols" in current_table.keys():
             df_this_table = df_this_table.drop(columns=current_table["remove_cols"])
 
         df_this_table = df_this_table.set_index("Year")
@@ -155,12 +175,14 @@ if __name__ == "__main__":
         # remove "," (thousand sep) from data
         for col in df_this_table.columns:
             df_this_table.loc[:, col] = df_this_table.loc[:, col].str.strip()
-            def repl(m):
-               return m.group('part1') + m.group('part2')
-            df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(
-                '(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-            df_this_table[col][df_this_table[col].isnull()] = 'NaN'
 
+            def repl(m):  # noqa: D103
+                return m.group("part1") + m.group("part2")
+
+            df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(
+                "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+            )
+            df_this_table[col][df_this_table[col].isna()] = "NaN"
 
         # metadata in first col instead of index
         df_this_table = df_this_table.reset_index()
@@ -170,7 +192,7 @@ if __name__ == "__main__":
         df_this_table.columns = df_this_table.columns.map(str)
 
         # make copy of columns if a column is used twice for metadata
-        if 'copy_cols' in current_table.keys():
+        if "copy_cols" in current_table.keys():
             for col in current_table["copy_cols"]:
             for col in current_table["copy_cols"]:
                 df_this_table[col] = df_this_table[current_table["copy_cols"][col]]
                 df_this_table[col] = df_this_table[current_table["copy_cols"][col]]
 
 
@@ -184,7 +206,7 @@ if __name__ == "__main__":
             coords_value_mapping=current_table["coords_value_mapping"],
             meta_data=meta_data,
             convert_str=True,
-            time_format='%Y',
+            time_format="%Y",
         )
 
         data_current_pm2 = pm2.pm2io.from_interchange_format(data_current_if)
@@ -193,7 +215,7 @@ if __name__ == "__main__":
         else:
             data_trend_pm2 = data_trend_pm2.pr.merge(data_current_pm2)
 
-    data_pm2 = data_inv_pm2.pr.merge(data_trend_pm2, tolerance=0.02) # some rounding in
+    data_pm2 = data_inv_pm2.pr.merge(data_trend_pm2, tolerance=0.02)  # some rounding in
     # trends needs higher tolerance
 
     data_if = data_pm2.pr.to_interchange_format()
@@ -205,48 +227,59 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_if)
+        data_if,
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] +
-                         "_raw.nc"),
-        encoding=encoding)
-
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
     #### processing
     data_proc_pm2 = data_pm2
     terminology_proc = coords_terminologies["category"]
 
     # combine CO2 emissions and removals
-    temp_CO2 = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
+    temp_CO2 = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum(
+        dim="entity", skipna=True, min_count=1
+    )
     data_proc_pm2["CO2"] = data_proc_pm2["CO2"].fillna(temp_CO2)
     data_proc_pm2["CO2"] = data_proc_pm2["CO2"].fillna(temp_CO2)
 
 
     # create net KYOTOGHG for 0 and 3
     # create net KYOTOGHG for 0 and 3
-    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"] \
-        = xr.full_like(data_proc_pm2["CO2 removals"],
-                       np.nan).pr.quantify(units="Gg CO2 / year")
-
-    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"].attrs = {"entity": "KYOTOGHG",
-                                                            "gwp_context": "AR5GWP100"}
-    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"] \
-        = data_proc_pm2.pr.gas_basket_contents_sum(
-        basket="KYOTOGHG removals (AR5GWP100)", basket_contents=['CO2 removals'],
-        skipna=True, min_count=1)
-    temp_KYOTOGHG = data_proc_pm2[["KYOTOGHG emissions (AR5GWP100)",
-                                   "KYOTOGHG removals (AR5GWP100)"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
-    data_proc_pm2["KYOTOGHG (AR5GWP100)"] \
-        = data_proc_pm2["KYOTOGHG (AR5GWP100)"].fillna(temp_KYOTOGHG)
-
+    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"] = xr.full_like(
+        data_proc_pm2["CO2 removals"], np.nan
+    ).pr.quantify(units="Gg CO2 / year")
+
+    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"].attrs = {
+        "entity": "KYOTOGHG",
+        "gwp_context": "AR5GWP100",
+    }
+    data_proc_pm2[
+        "KYOTOGHG removals (AR5GWP100)"
+    ] = data_proc_pm2.pr.gas_basket_contents_sum(
+        basket="KYOTOGHG removals (AR5GWP100)",
+        basket_contents=["CO2 removals"],
+        skipna=True,
+        min_count=1,
+    )
+    temp_KYOTOGHG = data_proc_pm2[
+        ["KYOTOGHG emissions (AR5GWP100)", "KYOTOGHG removals (AR5GWP100)"]
+    ].pr.sum(dim="entity", skipna=True, min_count=1)
+    data_proc_pm2["KYOTOGHG (AR5GWP100)"] = data_proc_pm2[
+        "KYOTOGHG (AR5GWP100)"
+    ].fillna(temp_KYOTOGHG)
 
     # actual processing
     data_proc_pm2 = process_data_for_country(
         data_proc_pm2,
-        entities_to_ignore=['CO2 emissions', 'CO2 removals',
-                            'KYOTOGHG emissions (AR5GWP100)',
-                            'KYOTOGHG removals (AR5GWP100)'],
+        entities_to_ignore=[
+            "CO2 emissions",
+            "CO2 removals",
+            "KYOTOGHG emissions (AR5GWP100)",
+            "KYOTOGHG removals (AR5GWP100)",
+        ],
         gas_baskets={},
         processing_info_country=processing_info_step1,
     )
@@ -256,16 +289,16 @@ if __name__ == "__main__":
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
         processing_info_country=processing_info_step2,
-        cat_terminology_out = terminology_proc,
-        #category_conversion = None,
-        #sectors_out = None,
+        cat_terminology_out=terminology_proc,
+        # category_conversion = None,
+        # sectors_out = None,
     )
 
     # adapt source and metadata
     # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
     # ###
     # save data to IF and native format
@@ -274,9 +307,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

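The thousands-separator clean-up in the trend-table loop above can be checked standalone; a minimal sketch of the same pandas str.replace with a callable replacement, using made-up sample values:

.. code-block:: python

    import pandas as pd

    s = pd.Series(["1,234.56", "12.3", "NaN"])
    cleaned = s.str.replace(
        "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$",
        lambda m: m.group("part1") + m.group("part2"),
        regex=True,
    )
    print(cleaned.tolist())  # ['1234.56', '12.3', 'NaN']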
+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Peru/__init__.py

@@ -0,0 +1,30 @@
+"""Read Peru's BURs, NIRs, NCs
+
+Scripts and configurations to read Peru's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'PER'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    # print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information by running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=PER
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 77 - 66
src/unfccc_ghg_data/unfccc_reader/Peru/config_per_bur3.py

@@ -1,3 +1,9 @@
+"""Config for Peru's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 table_def_templates = {
     "300": {  # 300
         "area": ["69,457,727,78"],
@@ -486,75 +492,80 @@ meta_data = {
 
 ## processing
 cat_conversion = {
-    'mapping': {
-        '0': '0',
-        '1': '1',
-        '1.A': '1.A',
-        '1.A.1': '1.A.1',
-        '1.A.2': '1.A.2',
-        '1.A.3': '1.A.3',
-        '1.A.4': '1.A.4',
-        '1.A.5': '1.A.5',
-        '1.B': '1.B',
-        '1.B.1': '1.B.1',
-        '1.B.2': '1.B.2',
-        '2': '2',
-        '2.A': '2.A',
-        '2.B': '2.B',
-        '2.C': '2.C',
-        '2.D': '2.D',
-        '2.E': '2.E',
-        '2.F': '2.F',
-        '2.G': '2.G',
-        '2.H': '2.H',
-        '3': 'M.AG',
-        '3.A': '3.A',
-        '3.A.1': '3.A.1',
-        '3.A.2': '3.A.2',
-        '3.C': '3.C',
-        '3.C.1': '3.C.1',
-        '3.C.2': '3.C.2',
-        '3.C.3': '3.C.3',
-        '3.C.4': '3.C.4',
-        '3.C.5': '3.C.5',
-        '3.C.6': '3.C.6',
-        '3.C.7': '3.C.7',
-        '4': 'M.LULUCF',
-        'M.2006.3.B': '3.B',
-        '4.A': '3.B.1',
-        '4.B': '3.B.2',
-        '4.C': '3.B.3',
-        '4.D': '3.B.4',
-        '4.E': '3.B.5',
-        '4.F': '3.B.6',
-        '4.G': '3.D.1',
-        '5': '4',
-        '5.A': '4.A',
-        '5.B': '4.B',
-        '5.C': '4.C',
-        '5.D': '4.D',
-        'M.BK': 'M.BK',
-        'M.BK.A': 'M.BK.A',
-        'M.BK.M': 'M.BM.M',
-        'M.BIO': 'M.BIO',
-    },
-    'aggregate': {
-        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
-              'name': 'IPPU'},
-        'M.3.C.AG': {
-            'sources': ['3.C'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG'],
-                     'name': 'Agriculture excluding livestock emissions'},
-        '3.D': {'sources': ['3.D.1'], 'name': 'Other'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    "mapping": {
+        "0": "0",
+        "1": "1",
+        "1.A": "1.A",
+        "1.A.1": "1.A.1",
+        "1.A.2": "1.A.2",
+        "1.A.3": "1.A.3",
+        "1.A.4": "1.A.4",
+        "1.A.5": "1.A.5",
+        "1.B": "1.B",
+        "1.B.1": "1.B.1",
+        "1.B.2": "1.B.2",
+        "2": "2",
+        "2.A": "2.A",
+        "2.B": "2.B",
+        "2.C": "2.C",
+        "2.D": "2.D",
+        "2.E": "2.E",
+        "2.F": "2.F",
+        "2.G": "2.G",
+        "2.H": "2.H",
+        "3": "M.AG",
+        "3.A": "3.A",
+        "3.A.1": "3.A.1",
+        "3.A.2": "3.A.2",
+        "3.C": "3.C",
+        "3.C.1": "3.C.1",
+        "3.C.2": "3.C.2",
+        "3.C.3": "3.C.3",
+        "3.C.4": "3.C.4",
+        "3.C.5": "3.C.5",
+        "3.C.6": "3.C.6",
+        "3.C.7": "3.C.7",
+        "4": "M.LULUCF",
+        "M.2006.3.B": "3.B",
+        "4.A": "3.B.1",
+        "4.B": "3.B.2",
+        "4.C": "3.B.3",
+        "4.D": "3.B.4",
+        "4.E": "3.B.5",
+        "4.F": "3.B.6",
+        "4.G": "3.D.1",
+        "5": "4",
+        "5.A": "4.A",
+        "5.B": "4.B",
+        "5.C": "4.C",
+        "5.D": "4.D",
+        "M.BK": "M.BK",
+        "M.BK.A": "M.BK.A",
+        "M.BK.M": "M.BM.M",
+        "M.BIO": "M.BIO",
+    },
+    "aggregate": {
+        "2": {
+            "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"],
+            "name": "IPPU",
+        },
+        "M.3.C.AG": {
+            "sources": ["3.C"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+        },
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG"],
+            "name": "Agriculture excluding livestock emissions",
+        },
+        "3.D": {"sources": ["3.D.1"], "name": "Other"},
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
     },
 }
 
 processing_info = {
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR4GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR4GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
     },
 }

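The mapping table above translates the report's 1996-style category codes into IPCC2006_PRIMAP codes; the actual conversion happens inside process_data_for_country, but at its core it is a dictionary lookup. A toy illustration, assuming the script's working directory so the config imports resolve:

.. code-block:: python

    from config_per_bur3 import cat_conversion  # as the reader script imports it

    mapping = cat_conversion["mapping"]
    for code in ["3", "4.A", "5.B"]:
        print(code, "->", mapping[code])  # 3 -> M.AG, 4.A -> 3.B.1, 5.B -> 4.B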
+ 33 - 20
src/unfccc_ghg_data/unfccc_reader/Peru/read_PER_BUR3_from_pdf.py

@@ -1,12 +1,17 @@
-# read Singapore fifth BUR from pdf
+"""
+Read Peru's BUR3 from pdf
 
+This script reads data from Peru's BUR3.
+Data are read from the pdf using camelot.
+
+"""
 
 import locale
 
 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_per_bur3 import (
+from config_per_bur3 import (
     cat_code_regexp,
     cat_codes_manual,
     cat_conversion,
@@ -103,20 +108,22 @@ if __name__ == "__main__":
 
             # drop cols if necessary
             if "drop_cols" in table_defs[page].keys():
-                # print(df_current.columns.values)
+                # print(df_current.columns.to_numpy())
                 df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
             elif "drop_cols" in table_def_templates[table_on_page].keys():
                 df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
 
             # rename category column
-            df_current.rename(
-                columns={table_defs[page]["category_col"]: index_cols[0]}, inplace=True
+            df_current = df_current.rename(
+                columns={table_defs[page]["category_col"]: index_cols[0]}
             )
 
             # replace double \n
             df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
             # replace double and triple spaces
-            df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+            df_current[index_cols[0]] = df_current[index_cols[0]].str.replace(
+                "   ", " "
+            )
             df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")
 
             # fix the split rows
@@ -137,7 +144,7 @@ if __name__ == "__main__":
             # set index
             # df_current = df_current.set_index(index_cols)
             # strip trailing and leading  and remove "^"
-            for col in df_current.columns.values:
+            for col in df_current.columns.to_numpy():
                 df_current[col] = df_current[col].str.strip()
                 df_current[col] = df_current[col].str.replace("^", "")
 
@@ -147,9 +154,9 @@ if __name__ == "__main__":
                 df_this_page = df_current.copy(deep=True)
             else:
                 # find intersecting cols
-                cols_this_page = df_this_page.columns.values
+                cols_this_page = df_this_page.columns.to_numpy()
                 # print(f"cols this page: {cols_this_page}")
                 # print(f"cols this page: {cols_this_page}")
-                cols_current = df_current.columns.values
+                cols_current = df_current.columns.to_numpy()
                 # print(f"cols current: {cols_current}")
                 # print(f"cols current: {cols_current}")
                 cols_both = list(set(cols_this_page).intersection(set(cols_current)))
                 cols_both = list(set(cols_this_page).intersection(set(cols_current)))
                 # print(f"cols both: {cols_both}")
                 # print(f"cols both: {cols_both}")
@@ -179,7 +186,9 @@ if __name__ == "__main__":
         # drop the rows with memo items etc
         for cat in cats_remove:
             df_this_page_long = df_this_page_long.drop(
-                df_this_page_long.loc[df_this_page_long.loc[:, index_cols[0]] == cat].index
+                df_this_page_long.loc[
+                    df_this_page_long.loc[:, index_cols[0]] == cat
+                ].index
             )
 
         # make a copy of the categories row
@@ -187,12 +196,14 @@ if __name__ == "__main__":
 
         # replace cat names by codes in col "Categories"
         # first the manual replacements
-        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, "category"].replace(
-            cat_codes_manual
-        )
+        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+            :, "category"
+        ].replace(cat_codes_manual)
+
         # then the regex replacements
         # then the regex replacements
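         # repl() prepends "IPC" to the matched code and converts it to a
         # PRIMAP2 category code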
-        def repl(m):
+        def repl(m):  # noqa: D103
             return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
         df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
             :, "category"
         ].str.replace(cat_code_regexp, repl, regex=True)
@@ -211,8 +222,10 @@ if __name__ == "__main__":
             ".", ""
             ".", ""
         )
         )
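         # the pattern below turns the last comma of a number into a decimal
         # point (e.g. "123,45" -> "123.45")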
         pat = r"^(?P<first>[0-9\.,]*),(?P<last>[0-9\.,]*)$"
         pat = r"^(?P<first>[0-9\.,]*),(?P<last>[0-9\.,]*)$"
-        def repl(m):
+
+        def repl(m):  # noqa: D103
             return f"{m.group('first')}.{m.group('last')}"
             return f"{m.group('first')}.{m.group('last')}"
+
         df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
         df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
             pat, repl, regex=True
             pat, repl, regex=True
         )
         )
@@ -265,12 +278,11 @@ if __name__ == "__main__":

     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
         encoding=encoding,
     )

-    #### continue here
-
     # ###
     # ## process the data
     # ###
@@ -288,7 +300,7 @@ if __name__ == "__main__":
     )

     # adapt source and metadata
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
     data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)

@@ -305,6 +317,7 @@ if __name__ == "__main__":

     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
         encoding=encoding,
     )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/__init__.py

@@ -0,0 +1,30 @@
+"""Read South Korea's BURs, NIRs, NCs
+
+Scripts and configurations to read Argentina's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (red using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'KOR'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    # print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=KOR
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 511 - 403
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/config_kor_bur4.py

@@ -1,413 +1,513 @@
+"""Config for South Korea's 2021 and 2022 inventories and BUR4
+
+Partial configuration for camelot and data aggregation. PRIMAP2 conversion
+config and metadata are defined in the reading script.
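+
+A sketch of how the reading scripts use these mappings (all names are defined
+in this module):
+
+.. code-block:: python
+
+    from config_kor_bur4 import cat_codes, cat_mapping
+
+    # original Korean category name -> IPCC1996-style category code
+    code = cat_codes["에너지"]  # "1" (energy)
+    # map to the IPCC2006 category where the two terminologies differ
+    code_2006 = cat_mapping.get(code, code)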
+
+"""
+
 original_names = [
-    '총배출량',
-    '순배출량',
-    '에너지',
-    'A. 연료연소',
-    '1. 에너지산업',
-    'a. 공공전기 및 열 생산',
-    'b. 석유정제',
-    'c. 고체연료 제조 및 기타 에너지 산업',
-    '2. 제조업 및 건설업',
-    'a. 철강',
-    'b. 비철금속',
-    'c. 화학',
-    'd. 펄프, 제지 및 인쇄',
-    'e. 식음료품 가공 및 담배 제조',
-    'f. 기타',
-    '  1. 비금속',
-    '  2. 조립금속',
-    '  3. 나무 및 목재',
-    '  4. 건설',
-    '  5. 섬유 및 가죽',
-    '  6. 기타제조',
-    '3. 수송',
-    'a. 민간항공',
-    'b. 도로수송',
-    'c. 철도',
-    'd. 해운',
-    'e. 기타수송',
-    '4. 기타',
-    'a. 상업/공공',
-    'b. 가정',
-    'c. 농업/임업/어업',
-    '5. 미분류',
-    'B. 탈루',
-    '1. 고체연료',
-    '2.  석유 및 천연가스',
-    'a.  석유',
-    'b. 천연가스',
-    '산업공정',
-    'A. 광물산업',
-    '1. 시멘트생산',
-    '2. 석회생산',
-    '3. 석회석 및 백운석 소비',
-    '4. 소다회 생산 및 소비',
-    '5. 아스팔트 루핑',
-    '6. 아스팔트 도로포장',
-    'B. 화학산업',
-    'C. 금속산업',
-    '1. 철강생산',
-    '2. 합금철 생산',
-    '3. 알루미늄 생산',
-    '4. 마그네슘 생산의 SF6 소비',
-    'D. 기타산업',
-    'E. 할로카본 및 육불화황 생산',
-    '1. 부산물 배출',
-    '2. 탈루 배출',
-    'F. 할로카본 및 육불화황 소비',
-    '1.  냉장 및 냉방',
-    '2.  발포제',
-    '3.  소화기',
-    '4.  에어로졸',
-    '5.  용매',
-    '6.  기타 용도의 ODS 대체물질 사용',
-    '7.  반도체 제조',
-    '8.  중전기기',
-    '9.  기타(잠재배출량)',
-    '농업',
-    'A.  장내발효',
-    '1. 소',
-    '2. 물소',
-    '3. 양(면양)',
-    '4. 양(산양)',
-    '5. 낙타 및 라마',
-    '6. 말',
-    '7. 노새 및 당나귀',
-    '8. 돼지',
-    '9. 가금류',
-    '10. 기타 가축(사슴)',
-    'B.  가축분뇨처리',
-    '1. 소',
-    '2. 물소',
-    '3. 양(면양)',
-    '4. 양(산양)',
-    '5. 낙타 및 라마',
-    '6. 말',
-    '7. 노새 및 당나귀',
-    '8. 돼지',
-    '9. 가금류',
-    '10. 기타 가축(사슴)',
-    'C.  벼재배',
-    '1. 관개',
-    '2. 천수답',
-    'D. 농경지토양',
-    '1. 직접배출',
-    '2. 목장, 방목구역, 분료(거름)',
-    '3. 간접배출',
-    'E. 사바나 소각',
-    'F. 작물잔사소각',
-    '1. 곡류',
-    '2. 두류(콩)',
-    '3. 근채류',
-    '4. 사탕수수',
-    '5. 기타',
-    'LULUCF',
-    'A. 산림지',
-    '1. 산림지로 유지된 산림지',
-    '2. 타토지에서 전용된 산림지',
-    '3. 산림지에서 질소 시비로 인한 N2O 배출',
-    '4. 산림지에서 배수로 인한 Non-CO2 배출',
-    '5. 산림지에서 바이오매스 연소에 의한 배출',
-    'B. 농경지',
-    '1. 농경지로 유지된 농경지',
-    '2. 타토지에서 전용된 농경지',
-    '3. 농경지로의 전용에 따른 N2O 배출',
-    '4. 농경지에서 농업용 석회시용으로 인한 CO2 배출',
-    '5. 농경지에서 바이오매스 연소에 의한 배출',
-    'C. 초지',
-    '1. 초지로 유지된 초지',
-    '2. 타토지에서 전용된 초지',
-    '3. 초지에서 농업용 석회시용으로 인한 CO2 배출',
-    '4. 초지에서 바이오매스 연소에 의한 배출',
-    'D. 습지',
-    '1. 습지로 유지된 습지',
-    '2. 타토지에서 전용된 습지',
-    '3. 습지에서 배수로 인한 Non-CO2 배출',
-    '4. 습지에서 바이오매스 연소에 의한 배출',
-    'E. 정주지',
-    'F. 기타토지',
-    '폐기물',
-    'A. 폐기물매립',
-    '1. 관리형 매립',
-    '2. 비관리형 매립',
-    'B. 하폐수처리',
-    '1. 폐수처리',
-    '2. 하수처리',
-    'C. 폐기물소각',
-    'D. 기타',
-    '별도항목(Memo Item)',
-    '분야·부문/연도',
-    'C. 국제벙커링 및 다국적 작전',
-    '1. 벙커링',
-    'a. 국제 항공',
-    'b. 국제 해운',
-    '2. 다국적 작전',
-    '* 참고 : NO = 배출활동 및 공정이 없는 경우, NE = 산정하지 아니하는 경우, NA = 자연적, 이론적으로 발생하지 않는 활동 및 공정의 경우, IE = 다른 항목에 포함하여 보고하는 경우, C = 기밀정보인 경우',
-    '3. 타토지로 전용된 농경지', # start of new codes in 2021 inventory
-    '4. 농경지로의 전용에 따른 N2O 배출',
-    '5. 농경지에서 농업용 석회시용으로 인한 CO2 배출',
-    '6. 농경지에서 바이오매스 연소에 의한 배출',
-    'G. 기타',
+    "총배출량",
+    "순배출량",
+    "에너지",
+    "A. 연료연소",
+    "1. 에너지산업",
+    "a. 공공전기 및 열 생산",
+    "b. 석유정제",
+    "c. 고체연료 제조 및 기타 에너지 산업",
+    "2. 제조업 및 건설업",
+    "a. 철강",
+    "b. 비철금속",
+    "c. 화학",
+    "d. 펄프, 제지 및 인쇄",
+    "e. 식음료품 가공 및 담배 제조",
+    "f. 기타",
+    "  1. 비금속",
+    "  2. 조립금속",
+    "  3. 나무 및 목재",
+    "  4. 건설",
+    "  5. 섬유 및 가죽",
+    "  6. 기타제조",
+    "3. 수송",
+    "a. 민간항공",
+    "b. 도로수송",
+    "c. 철도",
+    "d. 해운",
+    "e. 기타수송",
+    "4. 기타",
+    "a. 상업/공공",
+    "b. 가정",
+    "c. 농업/임업/어업",
+    "5. 미분류",
+    "B. 탈루",
+    "1. 고체연료",
+    "2.  석유 및 천연가스",
+    "a.  석유",
+    "b. 천연가스",
+    "산업공정",
+    "A. 광물산업",
+    "1. 시멘트생산",
+    "2. 석회생산",
+    "3. 석회석 및 백운석 소비",
+    "4. 소다회 생산 및 소비",
+    "5. 아스팔트 루핑",
+    "6. 아스팔트 도로포장",
+    "B. 화학산업",
+    "C. 금속산업",
+    "1. 철강생산",
+    "2. 합금철 생산",
+    "3. 알루미늄 생산",
+    "4. 마그네슘 생산의 SF6 소비",
+    "D. 기타산업",
+    "E. 할로카본 및 육불화황 생산",
+    "1. 부산물 배출",
+    "2. 탈루 배출",
+    "F. 할로카본 및 육불화황 소비",
+    "1.  냉장 및 냉방",
+    "2.  발포제",
+    "3.  소화기",
+    "4.  에어로졸",
+    "5.  용매",
+    "6.  기타 용도의 ODS 대체물질 사용",
+    "7.  반도체 제조",
+    "8.  중전기기",
+    "9.  기타(잠재배출량)",
+    "농업",
+    "A.  장내발효",
+    "1. 소",
+    "2. 물소",
+    "3. 양(면양)",
+    "4. 양(산양)",
+    "5. 낙타 및 라마",
+    "6. 말",
+    "7. 노새 및 당나귀",
+    "8. 돼지",
+    "9. 가금류",
+    "10. 기타 가축(사슴)",
+    "B.  가축분뇨처리",
+    "1. 소",
+    "2. 물소",
+    "3. 양(면양)",
+    "4. 양(산양)",
+    "5. 낙타 및 라마",
+    "6. 말",
+    "7. 노새 및 당나귀",
+    "8. 돼지",
+    "9. 가금류",
+    "10. 기타 가축(사슴)",
+    "C.  벼재배",
+    "1. 관개",
+    "2. 천수답",
+    "D. 농경지토양",
+    "1. 직접배출",
+    "2. 목장, 방목구역, 분료(거름)",
+    "3. 간접배출",
+    "E. 사바나 소각",
+    "F. 작물잔사소각",
+    "1. 곡류",
+    "2. 두류(콩)",
+    "3. 근채류",
+    "4. 사탕수수",
+    "5. 기타",
+    "LULUCF",
+    "A. 산림지",
+    "1. 산림지로 유지된 산림지",
+    "2. 타토지에서 전용된 산림지",
+    "3. 산림지에서 질소 시비로 인한 N2O 배출",
+    "4. 산림지에서 배수로 인한 Non-CO2 배출",
+    "5. 산림지에서 바이오매스 연소에 의한 배출",
+    "B. 농경지",
+    "1. 농경지로 유지된 농경지",
+    "2. 타토지에서 전용된 농경지",
+    "3. 농경지로의 전용에 따른 N2O 배출",
+    "4. 농경지에서 농업용 석회시용으로 인한 CO2 배출",
+    "5. 농경지에서 바이오매스 연소에 의한 배출",
+    "C. 초지",
+    "1. 초지로 유지된 초지",
+    "2. 타토지에서 전용된 초지",
+    "3. 초지에서 농업용 석회시용으로 인한 CO2 배출",
+    "4. 초지에서 바이오매스 연소에 의한 배출",
+    "D. 습지",
+    "1. 습지로 유지된 습지",
+    "2. 타토지에서 전용된 습지",
+    "3. 습지에서 배수로 인한 Non-CO2 배출",
+    "4. 습지에서 바이오매스 연소에 의한 배출",
+    "E. 정주지",
+    "F. 기타토지",
+    "폐기물",
+    "A. 폐기물매립",
+    "1. 관리형 매립",
+    "2. 비관리형 매립",
+    "B. 하폐수처리",
+    "1. 폐수처리",
+    "2. 하수처리",
+    "C. 폐기물소각",
+    "D. 기타",
+    "별도항목(Memo Item)",
+    "분야·부문/연도",
+    "C. 국제벙커링 및 다국적 작전",
+    "1. 벙커링",
+    "a. 국제 항공",
+    "b. 국제 해운",
+    "2. 다국적 작전",
+    "* 참고 : NO = 배출활동 및 공정이 없는 경우, NE = 산정하지 아니하는 경우, NA = 자연적, "
+    "이론적으로 발생하지 않는 활동 및 공정의 경우, IE = 다른 항목에 포함하여 보고하는 경우, "
+    "C = 기밀정보인 경우",
+    "3. 타토지로 전용된 농경지",  # start of new codes in 2021 inventory
+    "4. 농경지로의 전용에 따른 N2O 배출",
+    "5. 농경지에서 농업용 석회시용으로 인한 CO2 배출",
+    "6. 농경지에서 바이오매스 연소에 의한 배출",
+    "G. 기타",
 ]
 translations = [
-    ['Total emissions', 'M.0.EL'],
-    ['Net emissions', '0'],
-    ['energy', '1'],
-    ['A. Fuel combustion', '1.A'],
-    ['1. Energy industry', '1.A.1'],
-    ['a. Public electricity and heat production', '1.A.1.a'],
-    ['b. Oil refining', '1.A.1.b'],
-    ['c. Solid fuel manufacturing and other energy industries', '1.A.1.c'],
-    ['2. Manufacturing and construction', '1.A.2'],
-    ['a. steel', '1.A.2.a'],
-    ['b. Non-ferrous metal', '1.A.2.b'],
-    ['c. chemistry', '1.A.2.c'],
-    ['d. Pulp, paper and printing', '1.A.2.d'],
-    ['e. Food and beverage processing and tobacco manufacturing', '1.A.2.e'],
-    ['f. Etc', '1.A.2.f'],
-    ['  1. Non-metal', '1.A.2.f.1'],
-    ['  2. Assembly metal', '1.A.2.f.2'],
-    ['  3. Wood and timber', '1.A.2.f.3'],
-    ['  4. Construction', '1.A.2.f.4'],
-    ['  5. Textile and leather', '1.A.2.f.5'],
-    ['  6. Other manufacturing', '1.A.2.f.6'],
-    ['3. Transportation', '1.A.3'],
-    ['a. Civil aviation', '1.A.3.a.2'],
-    ['b. Road transport', '1.A.3.b'],
-    ['c. railroad', '1.A.3.c'],
-    ['d. shipping', '1.A.3.d.2'],
-    ['e. Other transport', '1.A.3.e'],
-    ['4. Other', '1.A.4'],
-    ['a. Commercial/Public', '1.A.4.a'],
-    ['b. home', '1.A.4.b'],
-    ['c. Agriculture/Forestry/Fishing', '1.A.4.c'],
-    ['5. Uncategorized', '1.A.5'],
-    ['B. Talu', '1.B'],
-    ['1. Solid fuel', '1.B.1'],
-    ['2. Oil and natural gas', '1.B.2'],
-    ['a. oil', '1.B.2.a'],
-    ['b. Natural gas', '1.B.2.b'],
-    ['Industrial process', '2'],
-    ['A. Mineral industry', '2.A'],
-    ['1. Cement production', '2.A.1'],
-    ['2. Lime production', '2.A.2'],
-    ['3. Limestone and Dolomite Consumption', '2.A.3'],
-    ['4. Soda ash production and consumption', '2.A.4'],
-    ['5. Asphalt roofing', '2.A.5'],
-    ['6. Asphalt road pavement', '2.A.6'],
-    ['B. Chemical industry', '2.B'],
-    ['C. Metal Industry', '2.C'],
-    ['1. Steel production', '2.C.1'],
-    ['2. Ferroalloy production', '2.C.2'],
-    ['3. Aluminum production', '2.C.3'],
-    ['4. SF6 consumption in magnesium production', '2.C.4'],
-    ['D. Other industries', '2.D'],
-    ['E. Production of halocarbons and sulfur hexafluoride', '2.E'],
-    ['1. Emission of by-products', '2.E.1'],
-    ['2. Fugitive discharge', '2.E.2'],
-    ['F. Consumption of halocarbons and sulfur hexafluoride', '2.F'],
-    ['1. Refrigeration and cooling', '2.F.1'],
-    ['2. Foaming agent', '2.F.2'],
-    ['3. Fire extinguisher', '2.F.3'],
-    ['4. Aerosol', '2.F.4'],
-    ['5. Solvent', '2.F.5'],
-    ['6. Use of ODS substitutes for other purposes', '2.F.6'],
-    ['7. Semiconductor manufacturing', '2.F.7'],
-    ['8. Heavy electric machine', '2.F.8'],
-    ['9. Others (potential emissions)', '2.F.9'],
-    ['Agriculture', '4'],
-    ['A. Intestinal fermentation', '4.A'],
-    ['1. cow', '4.A.1'],
-    ['2. Water buffalo', '4.A.2'],
-    ['3. Sheep (Cotton Sheep)', '4.A.3'],
-    ['4. Sheep (Goat)', '4.A.4'],
-    ['5. Camel and Llama', '4.A.5'],
-    ['6. Horse', '4.A.6'],
-    ['7. Mules and Donkeys', '4.A.7'],
-    ['8. Pig', '4.A.8'],
-    ['9. Poultry', '4.A.9'],
-    ['10. Other livestock (deer)', '4.A.10'],
-    ['B. Livestock manure treatment', '4.B'],
-    ['1. cow', '4.B.1'],
-    ['2. Water buffalo', '4.B.2'],
-    ['3. Sheep (Cotton Sheep)', '4.B.3'],
-    ['4. Sheep (Goat)', '4.B.4'],
-    ['5. Camel and Llama', '4.B.5'],
-    ['6. Horse', '4.B.6'],
-    ['7. Mules and Donkeys', '4.B.7'],
-    ['8. Pig', '4.B.8'],
-    ['9. Poultry', '4.B.9'],
-    ['10. Other livestock (deer)', '4.B.10'],
-    ['C. Rice cultivation', '4.C'],
-    ['1. irrigation', '4.C.1'],
-    ['2. Thousand answers', '4.C.4'],
-    ['D. Cropland soil', '4.D'],
-    ['1. Direct discharge', '4.D.1'],
-    ['2. Ranch, grazing area, manure (manure)', '4.D.2'],
-    ['3. Indirect emissions', '4.D.3'],
-    ['E. Savannah incineration', '4.E'],
-    ['F. Crop residue incineration', '4.F'],
-    ['1. Grains', '4.F.1'],
-    ['2. Beans (beans)', '4.F.2'],
-    ['3. Root vegetables', '4.F.3'],
-    ['4. Sugar cane', '4.F.4'],
-    ['5. Other', '4.F.5'],
-    ['LULUCF', '5'],
-    ['A. Forest land', '5.A'],
-    ['1. Forest land maintained as a forest land', '5.A.1'],  # categories differ from IPCC1996
-    ['2. Forest land converted from other lands', '5.A.2'],  # categories differ from IPCC1996
-    ['3. N2O emissions from nitrogen fertilization in forest areas', '5.A.3'],  # categories differ from IPCC1996
-    ['4. Non-CO2 emission due to drainage in forest areas', '5.A.4'],  # categories differ from IPCC1996
-    ['5. Emissions from biomass combustion in forest areas', '5.A.5'],  # categories differ from IPCC1996
-    ['B. Cropland', '5.B'],
-    ['1. Agricultural land maintained as agricultural land', '5.B.1'],  # categories differ from IPCC1996
-    ['2. Cropland converted from other lands', '5.B.2'],  # categories differ from IPCC1996
-    ['3. N2O emission due to conversion to agricultural land', '5.B.3'],  # categories differ from IPCC1996
-    ['4. CO2 emission from agricultural lime application in agricultural land', '5.B.4'],  # categories differ from IPCC1996
-    ['5. Emissions from biomass combustion in agricultural land', '5.B.5'],  # categories differ from IPCC1996
-    ['C. Grassland', '5.C'],
-    ['1. Grassland maintained as grassland', '5.C.1'],  # categories differ from IPCC1996
-    ['2. Grassland dedicated to Tatoji', '5.C.2'],  # categories differ from IPCC1996
-    ['3. CO2 emission from agricultural lime application in grassland', '5.C.3'],  # categories differ from IPCC1996
-    ['4. Emissions from biomass combustion in grassland', '5.C.4'],  # categories differ from IPCC1996
-    ['D. Wetlands', '5.D'],
-    ['1. Wetlands maintained as wetlands', '5.D.1'],  # categories differ from IPCC1996
-    ['2. Wetlands converted from Tatoji', '5.D.2'],  # categories differ from IPCC1996
-    ['3. Non-CO2 emission due to drainage in wetlands', '5.D.3'],  # categories differ from IPCC1996
-    ['4. Emissions from biomass combustion in wetlands', '5.D.4'],  # categories differ from IPCC1996
-    ['E. Jeongju-ji', '5.E'],
-    ['F. Other land', '5.F'],
-    ['waste', '6'],
-    ['A. Landfill of waste', '6.A'],
-    ['1. Managed landfill', '6.A.1'],
-    ['2. Unmanaged landfill', '6.A.2'],
-    ['B. Sewage water treatment', '6.B'],
-    ['1. Wastewater treatment', '6.B.1'],  # categories differ from IPCC1996
-    ['2. Sewage treatment', '6.B.2'],  # categories differ from IPCC1996
-    ['C. Waste incineration', '6.C'],
-    ['D. Other', '6.D'],
-    ['Memo Item', '\\IGNORE'],
-    ['Field·Sector/Year', '\\IGNORE'],
-    ['C. International bunkering and multinational operations', '\\IGNORE'],
-    ['1. Bunkering', 'M.1'],
-    ['a. International aviation', 'M.1.A'],
-    ['b. International shipping', 'M.1.B'],
-    ['2. Multinational operations', 'M.2'],
-    ['', '\\IGNORE'],
-    ['3. Farmland converted to Tato land', '5.B.3'],  # new codes in 2021 inventory start here
-    ['4. N2O emission due to conversion to agricultural land', '5.B.4'],
-    ['5. CO2 emission from agricultural lime application in agricultural land', '5.B.5'],
-    ['6. Emissions from burning biomass on agricultural land', '5.B.6'],
-    ['G. Others', '5.G'],
+    ["Total emissions", "M.0.EL"],
+    ["Net emissions", "0"],
+    ["energy", "1"],
+    ["A. Fuel combustion", "1.A"],
+    ["1. Energy industry", "1.A.1"],
+    ["a. Public electricity and heat production", "1.A.1.a"],
+    ["b. Oil refining", "1.A.1.b"],
+    ["c. Solid fuel manufacturing and other energy industries", "1.A.1.c"],
+    ["2. Manufacturing and construction", "1.A.2"],
+    ["a. steel", "1.A.2.a"],
+    ["b. Non-ferrous metal", "1.A.2.b"],
+    ["c. chemistry", "1.A.2.c"],
+    ["d. Pulp, paper and printing", "1.A.2.d"],
+    ["e. Food and beverage processing and tobacco manufacturing", "1.A.2.e"],
+    ["f. Etc", "1.A.2.f"],
+    ["  1. Non-metal", "1.A.2.f.1"],
+    ["  2. Assembly metal", "1.A.2.f.2"],
+    ["  3. Wood and timber", "1.A.2.f.3"],
+    ["  4. Construction", "1.A.2.f.4"],
+    ["  5. Textile and leather", "1.A.2.f.5"],
+    ["  6. Other manufacturing", "1.A.2.f.6"],
+    ["3. Transportation", "1.A.3"],
+    ["a. Civil aviation", "1.A.3.a.2"],
+    ["b. Road transport", "1.A.3.b"],
+    ["c. railroad", "1.A.3.c"],
+    ["d. shipping", "1.A.3.d.2"],
+    ["e. Other transport", "1.A.3.e"],
+    ["4. Other", "1.A.4"],
+    ["a. Commercial/Public", "1.A.4.a"],
+    ["b. home", "1.A.4.b"],
+    ["c. Agriculture/Forestry/Fishing", "1.A.4.c"],
+    ["5. Uncategorized", "1.A.5"],
+    ["B. Talu", "1.B"],
+    ["1. Solid fuel", "1.B.1"],
+    ["2. Oil and natural gas", "1.B.2"],
+    ["a. oil", "1.B.2.a"],
+    ["b. Natural gas", "1.B.2.b"],
+    ["Industrial process", "2"],
+    ["A. Mineral industry", "2.A"],
+    ["1. Cement production", "2.A.1"],
+    ["2. Lime production", "2.A.2"],
+    ["3. Limestone and Dolomite Consumption", "2.A.3"],
+    ["4. Soda ash production and consumption", "2.A.4"],
+    ["5. Asphalt roofing", "2.A.5"],
+    ["6. Asphalt road pavement", "2.A.6"],
+    ["B. Chemical industry", "2.B"],
+    ["C. Metal Industry", "2.C"],
+    ["1. Steel production", "2.C.1"],
+    ["2. Ferroalloy production", "2.C.2"],
+    ["3. Aluminum production", "2.C.3"],
+    ["4. SF6 consumption in magnesium production", "2.C.4"],
+    ["D. Other industries", "2.D"],
+    ["E. Production of halocarbons and sulfur hexafluoride", "2.E"],
+    ["1. Emission of by-products", "2.E.1"],
+    ["2. Fugitive discharge", "2.E.2"],
+    ["F. Consumption of halocarbons and sulfur hexafluoride", "2.F"],
+    ["1. Refrigeration and cooling", "2.F.1"],
+    ["2. Foaming agent", "2.F.2"],
+    ["3. Fire extinguisher", "2.F.3"],
+    ["4. Aerosol", "2.F.4"],
+    ["5. Solvent", "2.F.5"],
+    ["6. Use of ODS substitutes for other purposes", "2.F.6"],
+    ["7. Semiconductor manufacturing", "2.F.7"],
+    ["8. Heavy electric machine", "2.F.8"],
+    ["9. Others (potential emissions)", "2.F.9"],
+    ["Agriculture", "4"],
+    ["A. Intestinal fermentation", "4.A"],
+    ["1. cow", "4.A.1"],
+    ["2. Water buffalo", "4.A.2"],
+    ["3. Sheep (Cotton Sheep)", "4.A.3"],
+    ["4. Sheep (Goat)", "4.A.4"],
+    ["5. Camel and Llama", "4.A.5"],
+    ["6. Horse", "4.A.6"],
+    ["7. Mules and Donkeys", "4.A.7"],
+    ["8. Pig", "4.A.8"],
+    ["9. Poultry", "4.A.9"],
+    ["10. Other livestock (deer)", "4.A.10"],
+    ["B. Livestock manure treatment", "4.B"],
+    ["1. cow", "4.B.1"],
+    ["2. Water buffalo", "4.B.2"],
+    ["3. Sheep (Cotton Sheep)", "4.B.3"],
+    ["4. Sheep (Goat)", "4.B.4"],
+    ["5. Camel and Llama", "4.B.5"],
+    ["6. Horse", "4.B.6"],
+    ["7. Mules and Donkeys", "4.B.7"],
+    ["8. Pig", "4.B.8"],
+    ["9. Poultry", "4.B.9"],
+    ["10. Other livestock (deer)", "4.B.10"],
+    ["C. Rice cultivation", "4.C"],
+    ["1. irrigation", "4.C.1"],
+    ["2. Thousand answers", "4.C.4"],
+    ["D. Cropland soil", "4.D"],
+    ["1. Direct discharge", "4.D.1"],
+    ["2. Ranch, grazing area, manure (manure)", "4.D.2"],
+    ["3. Indirect emissions", "4.D.3"],
+    ["E. Savannah incineration", "4.E"],
+    ["F. Crop residue incineration", "4.F"],
+    ["1. Grains", "4.F.1"],
+    ["2. Beans (beans)", "4.F.2"],
+    ["3. Root vegetables", "4.F.3"],
+    ["4. Sugar cane", "4.F.4"],
+    ["5. Other", "4.F.5"],
+    ["LULUCF", "5"],
+    ["A. Forest land", "5.A"],
+    [
+        "1. Forest land maintained as a forest land",
+        "5.A.1",
+    ],  # categories differ from IPCC1996
+    [
+        "2. Forest land converted from other lands",
+        "5.A.2",
+    ],  # categories differ from IPCC1996
+    [
+        "3. N2O emissions from nitrogen fertilization in forest areas",
+        "5.A.3",
+    ],  # categories differ from IPCC1996
+    [
+        "4. Non-CO2 emission due to drainage in forest areas",
+        "5.A.4",
+    ],  # categories differ from IPCC1996
+    [
+        "5. Emissions from biomass combustion in forest areas",
+        "5.A.5",
+    ],  # categories differ from IPCC1996
+    ["B. Cropland", "5.B"],
+    [
+        "1. Agricultural land maintained as agricultural land",
+        "5.B.1",
+    ],  # categories differ from IPCC1996
+    [
+        "2. Cropland converted from other lands",
+        "5.B.2",
+    ],  # categories differ from IPCC1996
+    [
+        "3. N2O emission due to conversion to agricultural land",
+        "5.B.3",
+    ],  # categories differ from IPCC1996
+    [
+        "4. CO2 emission from agricultural lime application in agricultural land",
+        "5.B.4",
+    ],  # categories differ from IPCC1996
+    [
+        "5. Emissions from biomass combustion in agricultural land",
+        "5.B.5",
+    ],  # categories differ from IPCC1996
+    ["C. Grassland", "5.C"],
+    [
+        "1. Grassland maintained as grassland",
+        "5.C.1",
+    ],  # categories differ from IPCC1996
+    ["2. Grassland dedicated to Tatoji", "5.C.2"],  # categories differ from IPCC1996
+    [
+        "3. CO2 emission from agricultural lime application in grassland",
+        "5.C.3",
+    ],  # categories differ from IPCC1996
+    [
+        "4. Emissions from biomass combustion in grassland",
+        "5.C.4",
+    ],  # categories differ from IPCC1996
+    ["D. Wetlands", "5.D"],
+    ["1. Wetlands maintained as wetlands", "5.D.1"],  # categories differ from IPCC1996
+    ["2. Wetlands converted from Tatoji", "5.D.2"],  # categories differ from IPCC1996
+    [
+        "3. Non-CO2 emission due to drainage in wetlands",
+        "5.D.3",
+    ],  # categories differ from IPCC1996
+    [
+        "4. Emissions from biomass combustion in wetlands",
+        "5.D.4",
+    ],  # categories differ from IPCC1996
+    ["E. Jeongju-ji", "5.E"],
+    ["F. Other land", "5.F"],
+    ["waste", "6"],
+    ["A. Landfill of waste", "6.A"],
+    ["1. Managed landfill", "6.A.1"],
+    ["2. Unmanaged landfill", "6.A.2"],
+    ["B. Sewage water treatment", "6.B"],
+    ["1. Wastewater treatment", "6.B.1"],  # categories differ from IPCC1996
+    ["2. Sewage treatment", "6.B.2"],  # categories differ from IPCC1996
+    ["C. Waste incineration", "6.C"],
+    ["D. Other", "6.D"],
+    ["Memo Item", "\\IGNORE"],
+    ["Field·Sector/Year", "\\IGNORE"],
+    ["C. International bunkering and multinational operations", "\\IGNORE"],
+    ["1. Bunkering", "M.1"],
+    ["a. International aviation", "M.1.A"],
+    ["b. International shipping", "M.1.B"],
+    ["2. Multinational operations", "M.2"],
+    ["", "\\IGNORE"],
+    [
+        "3. Farmland converted to Tato land",
+        "5.B.3",
+    ],  # new codes in 2021 inventory start here
+    ["4. N2O emission due to conversion to agricultural land", "5.B.4"],
+    [
+        "5. CO2 emission from agricultural lime application in agricultural land",
+        "5.B.5",
+    ],
+    ["6. Emissions from burning biomass on agricultural land", "5.B.6"],
+    ["G. Others", "5.G"],
 ]
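 # build lookup dicts: original (Korean) category name -> English name / code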
 cat_name_translations = dict(zip(original_names, [cat[0] for cat in translations]))
 cat_codes = dict(zip(original_names, [cat[1] for cat in translations]))

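 # categories filtered out in the IPCC2006 conversion; see inline comments below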
 remove_cats = [
-    '1.A.1.a', '1.A.1.b', '1.A.1.c', '1.A.2.f',
-    '2.A', '2.D',
-    '2.F', '2.G',
-    '4.C.1', '4.C.4',
-    '4.D',
-    '4.F.1', '4.F.2', '4.F.3', '4.F.4', '4.F.5',  # detail not in 2006 categories
-    '5.A', '5.A.1', '5.A.2', '5.A.3', '5.A.4', '5.A.5',  # don't not match IPCC
+    "1.A.1.a",
+    "1.A.1.b",
+    "1.A.1.c",
+    "1.A.2.f",
+    "2.A",
+    "2.D",
+    "2.F",
+    "2.G",
+    "4.C.1",
+    "4.C.4",
+    "4.D",
+    "4.F.1",
+    "4.F.2",
+    "4.F.3",
+    "4.F.4",
+    "4.F.5",  # detail not in 2006 categories
+    "5.A",
+    "5.A.1",
+    "5.A.2",
+    "5.A.3",
+    "5.A.4",
+    "5.A.5",  # don't not match IPCC
     # categories
     # categories
-    '5.B', '5.B.1', '5.B.2', '5.B.3', '5.B.4', '5.B.5',
-    '5.C', '5.C.1', '5.C.2', '5.C.3', '5.C.4',
-    '5.D', '5.D.1', '5.D.2', '5.D.3', '5.D.4',
-    '5.E', '5.F',
-    '5.G', '5.B.6', # for 2021 NIR
+    "5.B",
+    "5.B.1",
+    "5.B.2",
+    "5.B.3",
+    "5.B.4",
+    "5.B.5",
+    "5.C",
+    "5.C.1",
+    "5.C.2",
+    "5.C.3",
+    "5.C.4",
+    "5.D",
+    "5.D.1",
+    "5.D.2",
+    "5.D.3",
+    "5.D.4",
+    "5.E",
+    "5.F",
+    "5.G",
+    "5.B.6",  # for 2021 NIR
 ]

 aggregate_before_mapping = {
-    '2006.2.D.4': {'sources': ['2.A.5', '2.A.6'], 'name': 'Other'},
-    '2006.3.C.4': {'sources': ['4.D.1', '4.D.2'],
-                   'name': 'Direct N2O Emissions from Managed Soils'},
-    '2006.M.3C1AG': {'sources': ['4.E', '4.F'], 'name': 'Biomass burning Agriculture'},
-    '2006.1.A.2.m': {'sources': ['1.A.2.f.2', '1.A.2.f.6'], 'name': 'Other'},
+    "2006.2.D.4": {"sources": ["2.A.5", "2.A.6"], "name": "Other"},
+    "2006.3.C.4": {
+        "sources": ["4.D.1", "4.D.2"],
+        "name": "Direct N2O Emissions from Managed Soils",
+    },
+    "2006.M.3C1AG": {"sources": ["4.E", "4.F"], "name": "Biomass burning Agriculture"},
+    "2006.1.A.2.m": {"sources": ["1.A.2.f.2", "1.A.2.f.6"], "name": "Other"},
 }

 cat_mapping = {
-    '1.A.2.f.1': '1.A.2.f',
-    '1.A.2.f.3': '1.A.2.j',
-    '1.A.2.f.4': '1.A.2.k',
-    '1.A.2.f.5': '1.A.2.l',
-    '2006.1.A.2.m': '1.A.2.m',
-    '2.A.4': '2.B.7',  # add to 2.B
-    '2.A.3': '2.A.4',
-    '2.D': '2.H',
-    '2006.2.D.4': '2.D.4',
-    '2.E': '2.B.9',  # add to 2.B
-    '2.E.1': '2.B.9.a',
-    '2.E.2': '2.B.9.b',
+    "1.A.2.f.1": "1.A.2.f",
+    "1.A.2.f.3": "1.A.2.j",
+    "1.A.2.f.4": "1.A.2.k",
+    "1.A.2.f.5": "1.A.2.l",
+    "2006.1.A.2.m": "1.A.2.m",
+    "2.A.4": "2.B.7",  # add to 2.B
+    "2.A.3": "2.A.4",
+    "2.D": "2.H",
+    "2006.2.D.4": "2.D.4",
+    "2.E": "2.B.9",  # add to 2.B
+    "2.E.1": "2.B.9.a",
+    "2.E.2": "2.B.9.b",
     #    '2.F', # remove?
-    '2.F.1': '2.F.1',  # just added here to avoid confusion
+    "2.F.1": "2.F.1",  # just added here to avoid confusion
     #    '2.F.2', '2.F.3', '2.F.4', '2.F.5',
-    '2.F.6': '2.E_1',
-    '2.F.7': '2.E_2',
-    '2.F.8': '2.G.1',
-    '2.F.9': '2.G.2',
-    '4': 'M.AG',
-    '4.A': '3.A.1',
-    '4.A.1': '3.A.1.a',
-    '4.A.2': '3.A.1.b',
-    '4.A.3': '3.A.1.c',
-    '4.A.4': '3.A.1.d',
-    '4.A.5': '3.A.1.e',
-    '4.A.6': '3.A.1.f',
-    '4.A.7': '3.A.1.g',
-    '4.A.8': '3.A.1.h',
-    '4.A.9': '3.A.1.i',
-    '4.A.10': '3.A.1.j',
-    '4.B': '3.A.2',
-    '4.B.1': '3.A.2.a',
-    '4.B.2': '3.A.2.b',
-    '4.B.3': '3.A.2.c',
-    '4.B.4': '3.A.2.d',
-    '4.B.5': '3.A.2.e',
-    '4.B.6': '3.A.2.f',
-    '4.B.7': '3.A.2.g',
-    '4.B.8': '3.A.2.h',
-    '4.B.9': '3.A.2.i',
-    '4.B.10': '3.A.2.j',
-    '4.C': '3.C.7',
-    '2006.3.C.4': '3.C.4',
-    '4.D.3': '3.C.5',
-    '2006.M.3C1AG': 'M.3.C.1.AG',
-    '5': 'M.LULUCF',
-    '6': '4',
-    '6.A': '4.A',
-    '6.A.1': '4.A.1',
-    '6.A.2': '4.A.2',
-    '6.B': '4.D',
-    '6.B.1': '4.D.1',
-    '6.B.2': '4.D.2',
-    '6.C': '4.C.1',
-    '6.D': '4.E',
-    'M.1': 'M.BK',
-    'M.1.A': 'M.BK.A',
-    'M.1.B': 'M.BK.M',
+    "2.F.6": "2.E_1",
+    "2.F.7": "2.E_2",
+    "2.F.8": "2.G.1",
+    "2.F.9": "2.G.2",
+    "4": "M.AG",
+    "4.A": "3.A.1",
+    "4.A.1": "3.A.1.a",
+    "4.A.2": "3.A.1.b",
+    "4.A.3": "3.A.1.c",
+    "4.A.4": "3.A.1.d",
+    "4.A.5": "3.A.1.e",
+    "4.A.6": "3.A.1.f",
+    "4.A.7": "3.A.1.g",
+    "4.A.8": "3.A.1.h",
+    "4.A.9": "3.A.1.i",
+    "4.A.10": "3.A.1.j",
+    "4.B": "3.A.2",
+    "4.B.1": "3.A.2.a",
+    "4.B.2": "3.A.2.b",
+    "4.B.3": "3.A.2.c",
+    "4.B.4": "3.A.2.d",
+    "4.B.5": "3.A.2.e",
+    "4.B.6": "3.A.2.f",
+    "4.B.7": "3.A.2.g",
+    "4.B.8": "3.A.2.h",
+    "4.B.9": "3.A.2.i",
+    "4.B.10": "3.A.2.j",
+    "4.C": "3.C.7",
+    "2006.3.C.4": "3.C.4",
+    "4.D.3": "3.C.5",
+    "2006.M.3C1AG": "M.3.C.1.AG",
+    "5": "M.LULUCF",
+    "6": "4",
+    "6.A": "4.A",
+    "6.A.1": "4.A.1",
+    "6.A.2": "4.A.2",
+    "6.B": "4.D",
+    "6.B.1": "4.D.1",
+    "6.B.2": "4.D.2",
+    "6.C": "4.C.1",
+    "6.D": "4.E",
+    "M.1": "M.BK",
+    "M.1.A": "M.BK.A",
+    "M.1.B": "M.BK.M",
 }

 aggregate_after_mapping = {
-    '1.A.3.a': {'sources': ['1.A.3.a.2'], 'name': 'Civil Aviation'},  # aviation
-    '1.A.3.d': {'sources': ['1.A.3.d.2'], 'name': 'Water-borne Navigation'},  # shipping
-    '2.A': {'sources': ['2.A.1', '2.A.2', '2.A.4', '2.A.5', '2.A.6'],
-            'name': 'Mineral Industry'},
-    '2.B': {'sources': ['2.B', '2.B.7', '2.B.9'], 'name': 'Chemical Industry'},
-    '2.D': {'sources': ['2.D.4'], 'name': 'Other'},
-    '2.E': {'sources': ['2.E_1', '2.E_2'], 'name': 'Electronics Industry'},
-    '2.F': {'sources': ['2.F.1', '2.F.2', '2.F.3', '2.F.4', '2.F.5'],
-            'name': 'Product uses as Substitutes for Ozone Depleting Substances'},
-    '2.G': {'sources': ['2.G.1', '2.G.2'], 'name': 'Other Product Manufacture and Use'},
-    '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-    '3.C': {'sources': ['3.C.4', '3.C.5', '3.C.7'],
-                 'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-    'M.3.C.AG': {'sources': ['3.C.4', '3.C.5', '3.C.7'],
-                 'name': 'Aggregate sources and non-CO2 emissions sources on land ('
-                         'Agriculture)'},
-    'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock'},
-    '4.C': {'sources': ['4.C.1'], 'name': 'Incineration and Open Burning of Waste'},
+    "1.A.3.a": {"sources": ["1.A.3.a.2"], "name": "Civil Aviation"},  # aviation
+    "1.A.3.d": {"sources": ["1.A.3.d.2"], "name": "Water-borne Navigation"},  # shipping
+    "2.A": {
+        "sources": ["2.A.1", "2.A.2", "2.A.4", "2.A.5", "2.A.6"],
+        "name": "Mineral Industry",
+    },
+    "2.B": {"sources": ["2.B", "2.B.7", "2.B.9"], "name": "Chemical Industry"},
+    "2.D": {"sources": ["2.D.4"], "name": "Other"},
+    "2.E": {"sources": ["2.E_1", "2.E_2"], "name": "Electronics Industry"},
+    "2.F": {
+        "sources": ["2.F.1", "2.F.2", "2.F.3", "2.F.4", "2.F.5"],
+        "name": "Product uses as Substitutes for Ozone Depleting Substances",
+    },
+    "2.G": {"sources": ["2.G.1", "2.G.2"], "name": "Other Product Manufacture and Use"},
+    "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+    "3.C": {
+        "sources": ["3.C.4", "3.C.5", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land",
+    },
+    "M.3.C.AG": {
+        "sources": ["3.C.4", "3.C.5", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land ("
+        "Agriculture)",
+    },
+    "M.AG.ELV": {"sources": ["M.3.C.AG"], "name": "Agriculture excluding livestock"},
+    "4.C": {"sources": ["4.C.1"], "name": "Incineration and Open Burning of Waste"},
 }

 coords_terminologies_2006 = {
@@ -422,27 +522,35 @@ filter_remove_2006 = {
     },
     "livestock": {  # temp until double cat name problem is solved
         "category (IPCC2006_PRIMAP)": [
-            '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
-            '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
+            "4.B.1",
+            "4.B.10",
+            "4.B.2",
+            "4.B.3",
+            "4.B.4",
+            "4.B.5",
+            "4.B.6",
+            "4.B.7",
+            "4.B.8",
+            "4.B.9",
         ]
     },
-    "fmap": {
-        "category (IPCC2006_PRIMAP)": remove_cats
-    },
+    "fmap": {"category (IPCC2006_PRIMAP)": remove_cats},
     "f_bef_map": {
     "f_bef_map": {
         "category (IPCC2006_PRIMAP)": [
         "category (IPCC2006_PRIMAP)": [
-            '2.A.5', '2.A.6',  # combined to 2006.2.D.4
-            '4.D.1', '4.D.2',  # combined to 2006.3.C.4
-            '4.E', '4.F',  # 2006.M.3.C.1.AG
-            '1.A.2.f.2', '1.A.2.f.6',  # 2006.1.A.2.m
+            "2.A.5",
+            "2.A.6",  # combined to 2006.2.D.4
+            "4.D.1",
+            "4.D.2",  # combined to 2006.3.C.4
+            "4.E",
+            "4.F",  # 2006.M.3.C.1.AG
+            "1.A.2.f.2",
+            "1.A.2.f.6",  # 2006.1.A.2.m
         ]
-    }
+    },
 }

 filter_remove_after_agg = {
     "tempCats": {
-        "category (IPCC2006_PRIMAP)": [
-            "2.E_1", "2.E_2"
-        ],
+        "category (IPCC2006_PRIMAP)": ["2.E_1", "2.E_2"],
     },
 }

+ 125 - 76
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2021_Inventory_from_xlsx.py

@@ -1,12 +1,18 @@
-# this script reads data from Korea's 2021 national inventory which is underlying BUR4
-# Data is read from the xlsx file
+"""
+Read Korea's 2021 inventory from xlsx
+
+This script reads data from Korea's 2021 national inventory, which underlies
+BUR4. Data are read from the xlsx file.
+
+"""
+

 import os
 import sys

 import pandas as pd
 import primap2 as pm2
-from .config_kor_bur4 import (
+from config_kor_bur4 import (
     aggregate_after_mapping,
     aggregate_before_mapping,
     cat_codes,
@@ -24,42 +30,43 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
-                   '2021-Inventory'
-    output_folder = extracted_data_path / 'non-UNFCCC' / 'Republic_of_Korea'
+    input_folder = (
+        downloaded_data_path / "non-UNFCCC" / "Republic_of_Korea" / "2021-Inventory"
+    )
+    output_folder = extracted_data_path / "non-UNFCCC" / "Republic_of_Korea"
     if not output_folder.exists():
         output_folder.mkdir()

-    output_filename = 'KOR_2021-Inventory_2021_'
+    output_filename = "KOR_2021-Inventory_2021_"

-    inventory_file = 'Republic_of_Korea_National_GHG_Inventory_(1990_2019).xlsx'
+    inventory_file = "Republic_of_Korea_National_GHG_Inventory_(1990_2019).xlsx"
     years_to_read = range(1990, 2019 + 1)

-    sheets_to_read = ['온실가스', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6']
+    sheets_to_read = ["온실가스", "CO2", "CH4", "N2O", "HFCs", "PFCs", "SF6"]
     cols_to_read = range(1, 2019 - 1990 + 3)

     # columns for category code and original category name
-    index_cols = ['분야·부문/연도']
+    index_cols = ["분야·부문/연도"]

     sheet_metadata = {
-        'entity': {
-            '온실가스': 'KYOTOGHG (SARGWP100)',
-            'CO2': 'CO2',
-            'CH4': 'CH4 (SARGWP100)',
-            'N2O': 'N2O (SARGWP100)',
-            'HFCs': 'HFCS (SARGWP100)',
-            'PFCs': 'PFCS (SARGWP100)',
-            'SF6': 'SF6 (SARGWP100)',
+        "entity": {
+            "온실가스": "KYOTOGHG (SARGWP100)",
+            "CO2": "CO2",
+            "CH4": "CH4 (SARGWP100)",
+            "N2O": "N2O (SARGWP100)",
+            "HFCs": "HFCS (SARGWP100)",
+            "PFCs": "PFCS (SARGWP100)",
+            "SF6": "SF6 (SARGWP100)",
+        },
+        "unit": {
+            "온실가스": "Gg CO2 / yr",
+            "CO2": "Gg CO2 / yr",
+            "CH4": "Gg CO2 / yr",
+            "N2O": "Gg CO2 / yr",
+            "HFCs": "Gg CO2 / yr",
+            "PFCs": "Gg CO2 / yr",
+            "SF6": "Gg CO2 / yr",
         },
-        'unit': {
-            '온실가스': 'Gg CO2 / yr',
-            'CO2': 'Gg CO2 / yr',
-            'CH4': 'Gg CO2 / yr',
-            'N2O': 'Gg CO2 / yr',
-            'HFCs': 'Gg CO2 / yr',
-            'PFCs': 'Gg CO2 / yr',
-            'SF6': 'Gg CO2 / yr',
-        }
     }

     # definitions for conversion to interchange format
@@ -73,7 +80,7 @@ if __name__ == "__main__":

     add_coords_cols = {
         "orig_cat_name": ["orig_cat_name", "category"],
-        "cat_name_translation": ["cat_name_translation", "category"]
+        "cat_name_translation": ["cat_name_translation", "category"],
     }

     coords_terminologies = {
@@ -99,12 +106,20 @@ if __name__ == "__main__":
         "f1": {
         "f1": {
             "category (IPCC1996_KOR_INV)": "\\IGNORE",
             "category (IPCC1996_KOR_INV)": "\\IGNORE",
         },
         },
-        "livestock": { # temp until double cat name problem is solved
+        "livestock": {  # temp until double cat name problem is solved
             "category (IPCC1996_KOR_INV)": [
             "category (IPCC1996_KOR_INV)": [
-                '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
-                '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
+                "4.B.1",
+                "4.B.10",
+                "4.B.2",
+                "4.B.3",
+                "4.B.4",
+                "4.B.5",
+                "4.B.6",
+                "4.B.7",
+                "4.B.8",
+                "4.B.9",
             ]
             ]
-        }
+        },
     }
     }
 
 
     filter_keep = {}
     filter_keep = {}
@@ -115,7 +130,8 @@ if __name__ == "__main__":
         "contact": "mail@johannes-guetschow.de",
         "contact": "mail@johannes-guetschow.de",
         "title": "Republic of Korea: National Greenhouse Gas Inventory Report 2021",
         "title": "Republic of Korea: National Greenhouse Gas Inventory Report 2021",
         "comment": "Read fom xlsx file by Johannes Gütschow",
         "comment": "Read fom xlsx file by Johannes Gütschow",
-        "institution": "Republic of Korea, Ministry of Environment, Greenhouse Gas Inventory and Research Center",
+        "institution": "Republic of Korea, Ministry of Environment, "
+        "Greenhouse Gas Inventory and Research Center",
     }
     }
 
 
     cols_for_space_stripping = []
     cols_for_space_stripping = []
@@ -135,11 +151,17 @@ if __name__ == "__main__":

     for sheet in sheets_to_read:
         # read current sheet (one sheet per gas)
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=sheet, skiprows=3, nrows=146, usecols=cols_to_read,
-                                   engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=sheet,
+            skiprows=3,
+            nrows=146,
+            usecols=cols_to_read,
+            engine="openpyxl",
+        )
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # add columns
@@ -153,7 +175,7 @@ if __name__ == "__main__":

     df_all = df_all.reset_index(drop=True)
     # rename category col because filtering produces problems with korean col names
-    df_all.rename(columns={"분야·부문/연도": "category"}, inplace=True)
+    df_all = df_all.rename(columns={"분야·부문/연도": "category"})

     # create copies of category col for further processing
     df_all["orig_cat_name"] = df_all["category"]
@@ -172,20 +194,22 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
-    copy_df=True, # we need the unchanged DF for the conversion step
-        )
+        copy_df=True,  # we need the unchanged DF for the conversion step
+    )

     filter_data(data_if, filter_remove=filter_remove)

-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     # convert back to IF to have units in the fixed format
-    data_pm2 = data_pm2.reset_coords(["orig_cat_name", "cat_name_translation"], drop=True)
+    data_pm2 = data_pm2.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
     data_if = data_pm2.pr.to_interchange_format()

     # ###
@@ -193,17 +217,20 @@ if __name__ == "__main__":
     # ###
     if not output_folder.exists():
         output_folder.mkdir()
-    #pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )

     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     encoding = {var: compression for var in data_pm2.data_vars}
-    #data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )

     # ###
     # conversion to ipcc 2006 categories
     # ###
-
-
     data_if_2006 = pm2.pm2io.convert_wide_dataframe_if(
         df_all,
         coords_cols=coords_cols,
@@ -216,21 +243,23 @@ if __name__ == "__main__":
         copy_df=True,  # don't mess up the dataframe when testing
     )

-    cat_label = 'category (' + coords_terminologies_2006["category"] + ')'
+    cat_label = "category (" + coords_terminologies_2006["category"] + ")"
     # agg before mapping

     for cat_to_agg in aggregate_before_mapping:
-        mask = data_if_2006[cat_label].isin(aggregate_before_mapping[cat_to_agg]["sources"])
+        mask = data_if_2006[cat_label].isin(
+            aggregate_before_mapping[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]

         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)

-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]

@@ -238,20 +267,25 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")

             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum()
-
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()

             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name",
-                              aggregate_before_mapping[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_before_mapping[cat_to_agg]["name"]
+            )

             df_combine = df_combine.reset_index()

             if cat_to_agg in aggregate_before_mapping[cat_to_agg]["sources"]:
-                filter_this_cat = {
-                    "f": {cat_label: cat_to_agg}
-                }
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
                 filter_data(data_if_2006, filter_remove=filter_this_cat)

             data_if_2006 = pd.concat([data_if_2006, df_combine])
@@ -268,17 +302,19 @@ if __name__ == "__main__":
     # agg after mapping

     for cat_to_agg in aggregate_after_mapping:
-        mask = data_if_2006[cat_label].isin(aggregate_after_mapping[cat_to_agg]["sources"])
+        mask = data_if_2006[cat_label].isin(
+            aggregate_after_mapping[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]

         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)

-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]

@@ -286,36 +322,49 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")

             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum()
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()

             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name",
-                              aggregate_after_mapping[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_after_mapping[cat_to_agg]["name"]
+            )

             df_combine = df_combine.reset_index()

             if cat_to_agg in aggregate_after_mapping[cat_to_agg]["sources"]:
-                filter_this_cat = {
-                    "f": {cat_label: cat_to_agg}
-                }
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
                 filter_data(data_if_2006, filter_remove=filter_this_cat)

             data_if_2006 = pd.concat([data_if_2006, df_combine])
         else:
             print(f"no data to aggregate category {cat_to_agg}")

-
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
     # convert back to IF to have units in the fixed format
-    data_pm2_2006 = data_pm2_2006.reset_coords(["orig_cat_name", "cat_name_translation"],
-                                           drop=True)
+    data_pm2_2006 = data_pm2_2006.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
     # save IPCC2006 data

     filter_data(data_if_2006, filter_remove=filter_remove_after_agg)
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies_2006["category"]), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies_2006["category"]),
+        data_if_2006,
+    )

     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        encoding=encoding,
+    )

+ 140 - 82
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2022_Inventory_from_xlsx.py

@@ -1,12 +1,17 @@
-# this script reads data from Korea's 2021 national inventory which is underlying BUR4
-# Data is read from the xlsx file
+"""
+Read Korea's 2022 inventory from xlsx
+
+This script reads data from Korea's 2022 national inventory.
+Data are read from the xlsx file.
+
+"""
 
 
 import os
 import os
 import sys
 import sys
 
 
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from .config_kor_bur4 import (
+from config_kor_bur4 import (
     aggregate_after_mapping,
     aggregate_after_mapping,
     aggregate_before_mapping,
     aggregate_before_mapping,
     cat_codes,
     cat_codes,
@@ -24,42 +29,43 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
-                   '2022-Inventory'
-    output_folder = extracted_data_path / 'non-UNFCCC' / 'Republic_of_Korea'
+    input_folder = (
+        downloaded_data_path / "non-UNFCCC" / "Republic_of_Korea" / "2022-Inventory"
+    )
+    output_folder = extracted_data_path / "non-UNFCCC" / "Republic_of_Korea"
     if not output_folder.exists():
         output_folder.mkdir()

-    output_filename = 'KOR_2022-Inventory_2022_'
+    output_filename = "KOR_2022-Inventory_2022_"

-    inventory_file = 'Republic_of_Korea_National_GHG_Inventory_(1990_2020).xlsx'
+    inventory_file = "Republic_of_Korea_National_GHG_Inventory_(1990_2020).xlsx"
     years_to_read = range(1990, 2020 + 1)

-    sheets_to_read = ['온실가스', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6']
+    sheets_to_read = ["온실가스", "CO2", "CH4", "N2O", "HFCs", "PFCs", "SF6"]
     cols_to_read = range(1, 2020 - 1990 + 3)

     # columns for category code and original category name
-    index_cols = ['분야·부문/연도']
+    index_cols = ["분야·부문/연도"]

     sheet_metadata = {
-        'entity': {
-            '온실가스': 'KYOTOGHG (SARGWP100)',
-            'CO2': 'CO2',
-            'CH4': 'CH4 (SARGWP100)',
-            'N2O': 'N2O (SARGWP100)',
-            'HFCs': 'HFCS (SARGWP100)',
-            'PFCs': 'PFCS (SARGWP100)',
-            'SF6': 'SF6 (SARGWP100)',
+        "entity": {
+            "온실가스": "KYOTOGHG (SARGWP100)",
+            "CO2": "CO2",
+            "CH4": "CH4 (SARGWP100)",
+            "N2O": "N2O (SARGWP100)",
+            "HFCs": "HFCS (SARGWP100)",
+            "PFCs": "PFCS (SARGWP100)",
+            "SF6": "SF6 (SARGWP100)",
+        },
+        "unit": {
+            "온실가스": "Gg CO2 / yr",
+            "CO2": "Gg CO2 / yr",
+            "CH4": "Gg CO2 / yr",
+            "N2O": "Gg CO2 / yr",
+            "HFCs": "Gg CO2 / yr",
+            "PFCs": "Gg CO2 / yr",
+            "SF6": "Gg CO2 / yr",
         },
-        'unit': {
-            '온실가스': 'Gg CO2 / yr',
-            'CO2': 'Gg CO2 / yr',
-            'CH4': 'Gg CO2 / yr',
-            'N2O': 'Gg CO2 / yr',
-            'HFCs': 'Gg CO2 / yr',
-            'PFCs': 'Gg CO2 / yr',
-            'SF6': 'Gg CO2 / yr',
-        }
     }

     # definitions for conversion to interchange format
@@ -73,7 +79,7 @@ if __name__ == "__main__":

     add_coords_cols = {
         "orig_cat_name": ["orig_cat_name", "category"],
-        "cat_name_translation": ["cat_name_translation", "category"]
+        "cat_name_translation": ["cat_name_translation", "category"],
     }

     coords_terminologies = {
@@ -99,12 +105,20 @@ if __name__ == "__main__":
         "f1": {
         "f1": {
             "category (IPCC1996_KOR_INV)": "\\IGNORE",
             "category (IPCC1996_KOR_INV)": "\\IGNORE",
         },
         },
-        "livestock": { # temp until double cat name problem is solved
+        "livestock": {  # temp until double cat name problem is solved
             "category (IPCC1996_KOR_INV)": [
             "category (IPCC1996_KOR_INV)": [
-                '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
-                '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
+                "4.B.1",
+                "4.B.10",
+                "4.B.2",
+                "4.B.3",
+                "4.B.4",
+                "4.B.5",
+                "4.B.6",
+                "4.B.7",
+                "4.B.8",
+                "4.B.9",
             ]
-        }
+        },
     }

     filter_keep = {}
@@ -115,11 +129,10 @@ if __name__ == "__main__":
         "contact": "mail@johannes-guetschow.de",
         "contact": "mail@johannes-guetschow.de",
         "title": "Republic of Korea: National Greenhouse Gas Inventory Report 2022",
         "title": "Republic of Korea: National Greenhouse Gas Inventory Report 2022",
         "comment": "Read fom xlsx file by Johannes Gütschow",
         "comment": "Read fom xlsx file by Johannes Gütschow",
-        "institution": "Republic of Korea, Ministry of Environment, Greenhouse Gas Inventory and Research Center",
+        "institution": "Republic of Korea, Ministry of Environment, "
+        "Greenhouse Gas Inventory and Research Center",
     }

-
-
     cols_for_space_stripping = []

     compression = dict(zlib=True, complevel=9)
@@ -137,11 +150,17 @@ if __name__ == "__main__":

     for sheet in sheets_to_read:
         # read current sheet (one sheet per gas)
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=sheet, skiprows=3, nrows=146, usecols=cols_to_read,
-                                   engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=sheet,
+            skiprows=3,
+            nrows=146,
+            usecols=cols_to_read,
+            engine="openpyxl",
+        )
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # make sure all col headers are str
@@ -157,14 +176,12 @@ if __name__ == "__main__":

     df_all = df_all.reset_index(drop=True)
     # rename category col because filtering produces problems with korean col names
-    df_all.rename(columns={"분야·부문/연도": "category"}, inplace=True)
+    df_all = df_all.rename(columns={"분야·부문/연도": "category"})

     # create copies of category col for further processing
     df_all["orig_cat_name"] = df_all["category"]
     df_all["cat_name_translation"] = df_all["category"]

-
-
     # ###
     # convert to PRIMAP2 interchange format
     # ###
@@ -175,20 +192,22 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
-        copy_df=True, # we need the unchanged DF for the conversion step
-        )
+        copy_df=True,  # we need the unchanged DF for the conversion step
+    )

     filter_data(data_if, filter_remove=filter_remove)

-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     # convert back to IF to have units in the fixed format
-    data_pm2 = data_pm2.reset_coords(["orig_cat_name", "cat_name_translation"], drop=True)
+    data_pm2 = data_pm2.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
     data_if = data_pm2.pr.to_interchange_format()

     # ###
@@ -196,17 +215,21 @@ if __name__ == "__main__":
     # ###
     if not output_folder.exists():
         output_folder.mkdir()
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )

     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )

     # ###
     # conversion to ipcc 2006 categories
     # ###

-
     data_if_2006 = pm2.pm2io.convert_wide_dataframe_if(
         df_all,
         coords_cols=coords_cols,
@@ -219,21 +242,23 @@ if __name__ == "__main__":
         copy_df=True,  # don't mess up the dataframe when testing
     )

-    cat_label = 'category (' + coords_terminologies_2006["category"] + ')'
+    cat_label = "category (" + coords_terminologies_2006["category"] + ")"
     # agg before mapping

     for cat_to_agg in aggregate_before_mapping:
-        mask = data_if_2006[cat_label].isin(aggregate_before_mapping[cat_to_agg]["sources"])
+        mask = data_if_2006[cat_label].isin(
+            aggregate_before_mapping[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]

         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)

-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]

@@ -241,20 +266,32 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")

             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum()
-
-            df_combine = df_combine.drop(columns=["category (IPCC2006_PRIMAP)", "orig_cat_name", "cat_name_translation"])
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
+
+            df_combine = df_combine.drop(
+                columns=[
+                    "category (IPCC2006_PRIMAP)",
+                    "orig_cat_name",
+                    "cat_name_translation",
+                ]
+            )
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name",
-                              aggregate_before_mapping[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_before_mapping[cat_to_agg]["name"]
+            )

             df_combine = df_combine.reset_index()

             if cat_to_agg in aggregate_before_mapping[cat_to_agg]["sources"]:
-                filter_this_cat = {
-                    "f": {cat_label: cat_to_agg}
-                }
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
                 filter_data(data_if_2006, filter_remove=filter_this_cat)

             data_if_2006 = pd.concat([data_if_2006, df_combine])
@@ -271,17 +308,19 @@ if __name__ == "__main__":
     # agg after mapping

     for cat_to_agg in aggregate_after_mapping:
-        mask = data_if_2006[cat_label].isin(aggregate_after_mapping[cat_to_agg]["sources"])
+        mask = data_if_2006[cat_label].isin(
+            aggregate_after_mapping[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]

         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)

-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]

@@ -289,37 +328,56 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")

             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum()
-
-            df_combine = df_combine.drop(columns=["category (IPCC2006_PRIMAP)", "orig_cat_name", "cat_name_translation"])
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
+
+            df_combine = df_combine.drop(
+                columns=[
+                    "category (IPCC2006_PRIMAP)",
+                    "orig_cat_name",
+                    "cat_name_translation",
+                ]
+            )
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name",
-                              aggregate_after_mapping[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_after_mapping[cat_to_agg]["name"]
+            )

             df_combine = df_combine.reset_index()

             if cat_to_agg in aggregate_after_mapping[cat_to_agg]["sources"]:
-                filter_this_cat = {
-                    "f": {cat_label: cat_to_agg}
-                }
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
                 filter_data(data_if_2006, filter_remove=filter_this_cat)

             data_if_2006 = pd.concat([data_if_2006, df_combine])
         else:
             print(f"no data to aggregate category {cat_to_agg}")

-
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
     # convert back to IF to have units in the fixed format
-    data_pm2_2006 = data_pm2_2006.reset_coords(["orig_cat_name", "cat_name_translation"],
-                                           drop=True)
+    data_pm2_2006 = data_pm2_2006.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
     # save IPCC2006 data

     filter_data(data_if_2006, filter_remove=filter_remove_after_agg)
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies_2006["category"]), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies_2006["category"]),
+        data_if_2006,
+    )

     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        encoding=encoding,
+    )
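Both aggregation loops above implement one pattern: select the rows whose category is in ``sources``, sum the year columns with all other dimensions held fixed, and re-insert the sum under the target code (removing the target first if it is also listed as a source). A toy illustration of that pattern on a stripped-down frame (column names shortened, values hypothetical):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame(
        {
            "category": ["4.B.1", "4.B.2", "4.A"],
            "entity": ["CH4", "CH4", "CH4"],
            "unit": ["Gg CO2 / yr"] * 3,
            "1990": [1.0, 2.0, 5.0],
            "1991": [1.5, 2.5, 5.5],
        }
    )

    sources = ["4.B.1", "4.B.2"]
    agg = (
        df[df["category"].isin(sources)]
        .groupby(["entity", "unit"])  # all non-category dims stay fixed
        .sum(numeric_only=True)  # sums the year columns only
        .reset_index()
    )
    agg.insert(0, "category", "4.B")  # target category code
    df = pd.concat([df, agg], ignore_index=True)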

+ 75 - 47
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py

@@ -1,12 +1,17 @@
-# this script reads data from Korea's BUR4
-# Data is read from the xlsx file
+"""
+Read Korea's BUR4 from xlsx
+
+This script reads data from Korea's 2020 national inventory which is underlying BUR4
+Data are read from the xlsx file
+
+"""

 import os
 import sys

 import pandas as pd
 import primap2 as pm2
-from .config_kor_bur4 import cat_codes, cat_name_translations
+from config_kor_bur4 import cat_codes, cat_name_translations
 from primap2.pm2io._data_reading import filter_data

 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -15,42 +20,43 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
-                   '2020-Inventory'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Republic_of_Korea'
+    input_folder = (
+        downloaded_data_path / "non-UNFCCC" / "Republic_of_Korea" / "2020-Inventory"
+    )
+    output_folder = extracted_data_path / "UNFCCC" / "Republic_of_Korea"
     if not output_folder.exists():
         output_folder.mkdir()

-    output_filename = 'KOR_BUR4_2021_'
+    output_filename = "KOR_BUR4_2021_"
-    inventory_file = 'Republic_of_Korea_National_GHG_Inventory_(1990_2018).xlsx'
+    inventory_file = "Republic_of_Korea_National_GHG_Inventory_(1990_2018).xlsx"
     years_to_read = range(1990, 2018 + 1)

-    sheets_to_read = ['온실가스', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6']
+    sheets_to_read = ["온실가스", "CO2", "CH4", "N2O", "HFCs", "PFCs", "SF6"]
     cols_to_read = range(1, 2018 - 1990 + 3)

     # columns for category code and original category name
-    index_cols = ['분야·부문/연도']
+    index_cols = ["분야·부문/연도"]

     sheet_metadata = {
-        'entity': {
-            '온실가스': 'KYOTOGHG (SARGWP100)',
-            'CO2': 'CO2',
-            'CH4': 'CH4 (SARGWP100)',
-            'N2O': 'N2O (SARGWP100)',
-            'HFCs': 'HFCS (SARGWP100)',
-            'PFCs': 'PFCS (SARGWP100)',
-            'SF6': 'SF6 (SARGWP100)',
+        "entity": {
+            "온실가스": "KYOTOGHG (SARGWP100)",
+            "CO2": "CO2",
+            "CH4": "CH4 (SARGWP100)",
+            "N2O": "N2O (SARGWP100)",
+            "HFCs": "HFCS (SARGWP100)",
+            "PFCs": "PFCS (SARGWP100)",
+            "SF6": "SF6 (SARGWP100)",
+        },
+        "unit": {
+            "온실가스": "Gg CO2 / yr",
+            "CO2": "Gg CO2 / yr",
+            "CH4": "Gg CO2 / yr",
+            "N2O": "Gg CO2 / yr",
+            "HFCs": "Gg CO2 / yr",
+            "PFCs": "Gg CO2 / yr",
+            "SF6": "Gg CO2 / yr",
         },
-        'unit': {
-            '온실가스': 'Gg CO2 / yr',
-            'CO2': 'Gg CO2 / yr',
-            'CH4': 'Gg CO2 / yr',
-            'N2O': 'Gg CO2 / yr',
-            'HFCs': 'Gg CO2 / yr',
-            'PFCs': 'Gg CO2 / yr',
-            'SF6': 'Gg CO2 / yr',
-        }
     }

     # definitions for conversion to interchange format
@@ -64,7 +70,7 @@ if __name__ == "__main__":

     add_coords_cols = {
         "orig_cat_name": ["orig_cat_name", "category"],
-        "cat_name_translation": ["cat_name_translation", "category"]
+        "cat_name_translation": ["cat_name_translation", "category"],
     }

     coords_terminologies = {
@@ -90,21 +96,32 @@ if __name__ == "__main__":
         "f1": {
         "f1": {
             "category (IPCC1996_KOR_INV)": "\\IGNORE",
             "category (IPCC1996_KOR_INV)": "\\IGNORE",
         },
         },
-        "livestock": { # temp until double cat name problem is solved
-            "category (IPCC1996_KOR_INV)": {
-                '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
-                '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
-            }
-        }
+        "livestock": {  # temp until double cat name problem is solved
+            "category (IPCC1996_KOR_INV)": [
+                "4.B.1",
+                "4.B.10",
+                "4.B.2",
+                "4.B.3",
+                "4.B.4",
+                "4.B.5",
+                "4.B.6",
+                "4.B.7",
+                "4.B.8",
+                "4.B.9",
+            ]
+        },
     }

     filter_keep = {}

     meta_data = {
-        "references": "https://unfccc.int/documents/418616, http://www.gir.go.kr/home/file/readDownloadFile.do?fileId=4856&fileSeq=2",
+        "references": "https://unfccc.int/documents/418616, "
+        "http://www.gir.go.kr/home/file/readDownloadFile.do?"
+        "fileId=4856&fileSeq=2",
         "rights": "",
         "rights": "",
         "contact": "mail@johannes-guetschow.de.de",
         "contact": "mail@johannes-guetschow.de.de",
-        "title": "Republic of Korea: BUR4 / National Greenhouse Gas Inventory Report 2020",
+        "title": "Republic of Korea: BUR4 / National Greenhouse Gas Inventory Report "
+        "2020",
         "comment": "Read fom xlsx file by Johannes Gütschow",
         "comment": "Read fom xlsx file by Johannes Gütschow",
         "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
         "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
     }
     }
@@ -126,11 +143,17 @@ if __name__ == "__main__":

     for sheet in sheets_to_read:
         # read current sheet (one sheet per gas)
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=sheet, skiprows=3, nrows=144, usecols=cols_to_read,
-                                   engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=sheet,
+            skiprows=3,
+            nrows=144,
+            usecols=cols_to_read,
+            engine="openpyxl",
+        )
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # add columns
@@ -144,7 +167,7 @@ if __name__ == "__main__":

     df_all = df_all.reset_index(drop=True)
     # rename category col because filtering produces problems with korean col names
-    df_all.rename(columns={"분야·부문/연도": "category"}, inplace=True)
+    df_all = df_all.rename(columns={"분야·부문/연도": "category"})

     # create copies of category col for further processing
     df_all["orig_cat_name"] = df_all["category"]
@@ -163,12 +186,12 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
-        convert_str=True
-        )
+        convert_str=True,
+    )

     filter_data(data_if, filter_remove=filter_remove)

@@ -181,7 +204,12 @@ if __name__ == "__main__":
     # ###
     if not output_folder.exists():
         output_folder.mkdir()
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )

     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )
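Both Korea readers assemble one long table from the per-gas sheets, attaching each sheet's entity and unit from ``sheet_metadata`` before concatenating. The column attachment itself falls between the hunks above, so treat it as an assumption; a condensed sketch of the loop under that assumption:

.. code-block:: python

    import pandas as pd

    df_all = None
    for sheet in sheets_to_read:
        df_current = pd.read_excel(
            input_folder / inventory_file,
            sheet_name=sheet,
            skiprows=3,
            nrows=144,
            usecols=cols_to_read,
            engine="openpyxl",
        )
        # drop rows without any category information
        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
        # attach entity and unit for this gas (assumed from sheet_metadata)
        df_current["entity"] = sheet_metadata["entity"][sheet]
        df_current["unit"] = sheet_metadata["unit"][sheet]
        df_all = df_current if df_all is None else pd.concat([df_all, df_current])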

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Singapore/__init__.py

@@ -0,0 +1,30 @@
+"""Read Singapore's BURs, NIRs, NCs
+
+Scripts and configurations to read Singapore's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'SGP'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=SGP
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 408 - 256
src/unfccc_ghg_data/unfccc_reader/Singapore/config_sgp_bur5.py

@@ -1,152 +1,222 @@
+"""Config for Singapore's BUR5
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 table_def_templates = {
-    '66_1': {  # 66
-        "area": ['68,743,522,157'],
-        "cols": ['224,280,319,359,399,445,481'],
+    "66_1": {  # 66
+        "area": ["68,743,522,157"],
+        "cols": ["224,280,319,359,399,445,481"],
         "rows_to_fix": {
         "rows_to_fix": {
             # 2: ['and Sink Categories',],
             # 2: ['and Sink Categories',],
-            3: ['1A2 Manufacturing Industries',
-                '1B3 Other Emissions from', '1C - Carbon Dioxide Transport',
-                '2 — INDUSTRIAL PROCESSES AND', '2D - Non-Energy Products from',
-                '2F - Product Uses as Substitutes for',
-                '2G - Other Product Manufacture'],
+            3: [
+                "1A2 Manufacturing Industries",
+                "1B3 Other Emissions from",
+                "1C - Carbon Dioxide Transport",
+                "2 — INDUSTRIAL PROCESSES AND",
+                "2D - Non-Energy Products from",
+                "2F - Product Uses as Substitutes for",
+                "2G - Other Product Manufacture",
+            ],
         },
     },
-    '66_2': {  # 66
-        "area": ['671,744,1117,265'],
-        "cols": ['824,875,912,954,996,1040,1082'],
+    "66_2": {  # 66
+        "area": ["671,744,1117,265"],
+        "cols": ["824,875,912,954,996,1040,1082"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['3 — AGRICULTURE, FORESTRY AND', '3C - Aggregate Sources and Non-CO2',
-                '4C - Incineration and Open Burning',
-                '4D -  Wastewater Treatment',
-                '5A - Indirect N2O emissions from the', 'CO2 from Biomass Combustion',
-                ],
+            3: [
+                "3 — AGRICULTURE, FORESTRY AND",
+                "3C - Aggregate Sources and Non-CO2",
+                "4C - Incineration and Open Burning",
+                "4D -  Wastewater Treatment",
+                "5A - Indirect N2O emissions from the",
+                "CO2 from Biomass Combustion",
+            ],
         },
     },
-    '67_1': {  # 67
-        "area": ['70,727,554,159'],
-        "cols": ['207,254,291,319,356,400,442,468,503'],
+    "67_1": {  # 67
+        "area": ["70,727,554,159"],
+        "cols": ["207,254,291,319,356,400,442,468,503"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['2 — INDUSTRIAL PROCESSES', '2A4 Other Process Uses',
-                '2B4 Caprolactam, Glyoxal and', '2B8 Petrochemical and',
-                ],
-            3: ['Total National Emissions',
-                ],
+            2: [
+                "2 — INDUSTRIAL PROCESSES",
+                "2A4 Other Process Uses",
+                "2B4 Caprolactam, Glyoxal and",
+                "2B8 Petrochemical and",
+            ],
+            3: [
+                "Total National Emissions",
+            ],
         },
     },
-    '67_2': {  # 67
-        "area": ['666,725,1150,119'],
-        "cols": ['801,847,889,915,952,996,1036,1063,1098'],
+    "67_2": {  # 67
+        "area": ["666,725,1150,119"],
+        "cols": ["801,847,889,915,952,996,1036,1063,1098"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['2D - Non-Energy Products from', '2G - Other Product',
-                '2G2 SF6 and PFCs from', '2H2 Food and Beverages',
-                ],
-            3: ['Total National Emissions', '2E1 Integrated Circuit',
-                '2F - Product Uses as Substitutes for', '2F1 Refrigeration and',
-                ],
+            2: [
+                "2D - Non-Energy Products from",
+                "2G - Other Product",
+                "2G2 SF6 and PFCs from",
+                "2H2 Food and Beverages",
+            ],
+            3: [
+                "Total National Emissions",
+                "2E1 Integrated Circuit",
+                "2F - Product Uses as Substitutes for",
+                "2F1 Refrigeration and",
+            ],
         },
     },
-    '68_1': {  # 68
-        "area": ['66,787,524,217'],
-        "cols": ['205,261,315,366,415,473'],
+    "68_1": {  # 68
+        "area": ["66,787,524,217"],
+        "cols": ["205,261,315,366,415,473"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['2 — INDUSTRIAL PROCESSES', '2A4 Other Process Uses',
-                '2B4 Caprolactam, Glyoxal and', '2B8 Petrochemical and',
-                ],
-            3: ['Total National Emissions',
-                ],
+            2: [
+                "2 — INDUSTRIAL PROCESSES",
+                "2A4 Other Process Uses",
+                "2B4 Caprolactam, Glyoxal and",
+                "2B8 Petrochemical and",
+            ],
+            3: [
+                "Total National Emissions",
+            ],
         },
     },
-    '68_2': {  # 68
-        "area": ['666,787,1119,180'],
-        "cols": ['808,854,910,961,1017,1066'],
+    "68_2": {  # 68
+        "area": ["666,787,1119,180"],
+        "cols": ["808,854,910,961,1017,1066"],
         "rows_to_fix": {
         "rows_to_fix": {
-            2: ['2D - Non-Energy Products from',
-                '2F - Product Uses as Substitutes for', '2F1 Refrigeration and Air',
-                '2G2 SF6 and PFCs from Other', '2H2 Food and Beverages',
-                ],
-            3: ['Total National Emissions', '2E1 Integrated Circuit or',
-                '2G - Other Product Manufacture',
-                ],
+            2: [
+                "2D - Non-Energy Products from",
+                "2F - Product Uses as Substitutes for",
+                "2F1 Refrigeration and Air",
+                "2G2 SF6 and PFCs from Other",
+                "2H2 Food and Beverages",
+            ],
+            3: [
+                "Total National Emissions",
+                "2E1 Integrated Circuit or",
+                "2G - Other Product Manufacture",
+            ],
         },
     },
-    '84_1': {  # 84
-        "area": ['70,667,525,112'],
-        "cols": ['193,291,345,396,440,480'],
+    "84_1": {  # 84
+        "area": ["70,667,525,112"],
+        "cols": ["193,291,345,396,440,480"],
         "rows_to_fix": {},
         "rows_to_fix": {},
     },
     },
-    '84_2': {  # 84
-        "area": ['668,667,1115,83'],
-        "cols": ['854,908,954,1001,1038,1073'],
-        "rows_to_fix": { },
+    "84_2": {  # 84
+        "area": ["668,667,1115,83"],
+        "cols": ["854,908,954,1001,1038,1073"],
+        "rows_to_fix": {},
     },
-    '85_1': {  # 85
-        "area": ['70,680,531,170'],
-        "cols": ['275,328,375,414,456,489'],
+    "85_1": {  # 85
+        "area": ["70,680,531,170"],
+        "cols": ["275,328,375,414,456,489"],
         "rows_to_fix": {},
         "rows_to_fix": {},
     },
     },
-    '85_2': {  # 85
-        "area": ['663,675,1117,175'],
-        "cols": ['849,908,954,1001,1045,1073'],
+    "85_2": {  # 85
+        "area": ["663,675,1117,175"],
+        "cols": ["849,908,954,1001,1045,1073"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['3C — Aggregate Sources and Non-CO2',
-                '3C4 - Direct N2O Emissions from', '3C5 - Indirect N2O Emissions from',
-                '3C6 - Indirect N2O Emissions from']
+            3: [
+                "3C — Aggregate Sources and Non-CO2",
+                "3C4 - Direct N2O Emissions from",
+                "3C5 - Indirect N2O Emissions from",
+                "3C6 - Indirect N2O Emissions from",
+            ]
         },
     },
-    '92': {  # 92
-        "area": ['72,672,514,333'],
-        "cols": ['228,275,319,361,398,438,489'],
+    "92": {  # 92
+        "area": ["72,672,514,333"],
+        "cols": ["228,275,319,361,398,438,489"],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['4A1 Managed Waste',
-                '4A2 Unmanaged Waste', '4A3 Uncategorised Waste',
-                '4C - Incineration and', '4D - Wastewater Treatment',
-                '4D1 Domestic Wastewater', '4D2 Industrial Wastewater']
+            3: [
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised Waste",
+                "4C - Incineration and",
+                "4D - Wastewater Treatment",
+                "4D1 Domestic Wastewater",
+                "4D2 Industrial Wastewater",
+            ]
         },
     },
-    '95_1': {  # 95
-        "area": ['70,731,507,149'],
-        "cols": ['233,307,375,452'],
+    "95_1": {  # 95
+        "area": ["70,731,507,149"],
+        "cols": ["233,307,375,452"],
         "drop_rows": [0, 1, 2, 3],
         "drop_rows": [0, 1, 2, 3],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['Total (Net)', '1A2 Manufacturing Industries',
-                '2 — INDUSTRIAL PROCESSES', '3 — AGRICULTURE, FORESTRY',
-                '3C - Aggregate Sources and Non-CO2', '4C - Incineration and Open',
-                'Clinical Waste', '4D - Wastewater Treatment',
-                'CO2 from Biomass Combustion for']
+            3: [
+                "Total (Net)",
+                "1A2 Manufacturing Industries",
+                "2 — INDUSTRIAL PROCESSES",
+                "3 — AGRICULTURE, FORESTRY",
+                "3C - Aggregate Sources and Non-CO2",
+                "4C - Incineration and Open",
+                "Clinical Waste",
+                "4D - Wastewater Treatment",
+                "CO2 from Biomass Combustion for",
+            ]
         },
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories',
-                       'Net CO2', 'CH4', 'N2O', 'HFCs'],
-            'unit': ['', 'Gg', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "Net CO2",
+                "CH4",
+                "N2O",
+                "HFCs",
+            ],
+            "unit": ["", "Gg", "GgCO2eq", "GgCO2eq", "GgCO2eq"],
         },
     },
-    '95_2': {  # 95
-        "area": ['666,731,1103,149'],
-        "cols": ['829,903,971,1048'],
+    "95_2": {  # 95
+        "area": ["666,731,1103,149"],
+        "cols": ["829,903,971,1048"],
         "drop_rows": [0, 1, 2, 3, 4, 5],
         "drop_rows": [0, 1, 2, 3, 4, 5],
         "rows_to_fix": {
         "rows_to_fix": {
-            3: ['Total (Net)', '1A2 Manufacturing Industries',
-                '2 — INDUSTRIAL PROCESSES', '3 — AGRICULTURE, FORESTRY',
-                '3C - Aggregate Sources and Non-CO2', '4C - Incineration and Open',
-                'Clinical Waste', '4D - Wastewater Treatment',
-                'CO2 from Biomass Combustion for']
+            3: [
+                "Total (Net)",
+                "1A2 Manufacturing Industries",
+                "2 — INDUSTRIAL PROCESSES",
+                "3 — AGRICULTURE, FORESTRY",
+                "3C - Aggregate Sources and Non-CO2",
+                "4C - Incineration and Open",
+                "Clinical Waste",
+                "4D - Wastewater Treatment",
+                "CO2 from Biomass Combustion for",
+            ]
         },
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories',
-                       'PFCs', 'SF6', 'NF3', 'Total (Net) National Emissions'],
-            'unit': ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "PFCs",
+                "SF6",
+                "NF3",
+                "Total (Net) National Emissions",
+            ],
+            "unit": ["", "GgCO2eq", "GgCO2eq", "GgCO2eq", "GgCO2eq"],
         },
     },
 }
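The ``area`` and ``cols`` strings in these templates are PDF coordinates ("x1,y1,x2,y2" regions and x positions of column separators). The extraction code is not part of this hunk; assuming the templates feed a camelot stream-mode read, as the format suggests, usage would look roughly like:

.. code-block:: python

    import camelot

    template = table_def_templates["66_1"]
    tables = camelot.read_pdf(
        "SGP_BUR5.pdf",  # hypothetical file name
        pages="66",
        flavor="stream",
        table_areas=template["area"],  # region to parse, in PDF points
        columns=template["cols"],  # x coordinates of column separators
    )
    df = tables[0].df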

 table_defs = {
-    '66': {
-        "templates": ['66_1', '66_2'],
+    "66": {
+        "templates": ["66_1", "66_2"],
         # "header_rows": [0, 1],
         # "header_rows": [0, 1],
         "header": {
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories', 'Net CO2',
-                       'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6', 'NF3'],
-            'unit': ['', 'Gg', 'Gg', 'Gg', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "Net CO2",
+                "CH4",
+                "N2O",
+                "HFCs",
+                "PFCs",
+                "SF6",
+                "NF3",
+            ],
+            "unit": ["", "Gg", "Gg", "Gg", "GgCO2eq", "GgCO2eq", "GgCO2eq", "GgCO2eq"],
         },
         "drop_rows": [0, 1, 2, 3],
         # "drop_cols": ['NF3', 'SF6'],
@@ -155,13 +225,22 @@ table_defs = {
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018",
         "coords_value_mapping": "2018",
     },
     },
-    '67': {
-        "templates": ['67_1', '67_2'],
+    "67": {
+        "templates": ["67_1", "67_2"],
         "header": {
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories', 'HFC-23', 'HFC-32',
-                       'HFC-41', 'HFC-125', 'HFC-134a', 'HFC-143a', 'HFC-152a',
-                       'HFC-227ea', 'HFC-43-10mee'],
-            'unit': ['', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "HFC-23",
+                "HFC-32",
+                "HFC-41",
+                "HFC-125",
+                "HFC-134a",
+                "HFC-143a",
+                "HFC-152a",
+                "HFC-227ea",
+                "HFC-43-10mee",
+            ],
+            "unit": ["", "kg", "kg", "kg", "kg", "kg", "kg", "kg", "kg", "kg"],
         },
         "drop_rows": [0, 1, 2, 3],
         # "drop_cols": ['NF3', 'SF6'],
@@ -170,24 +249,31 @@ table_defs = {
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018_fgases",
         "coords_value_mapping": "2018_fgases",
     },
     },
-    '68': {
-        "templates": ['68_1', '68_2'],
+    "68": {
+        "templates": ["68_1", "68_2"],
         "header": {
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories', 'PFC-14',
-                       'PFC-116', 'PFC-218', 'PFC-318', 'SF6', 'NF3'],
-            'unit': ['', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "PFC-14",
+                "PFC-116",
+                "PFC-218",
+                "PFC-318",
+                "SF6",
+                "NF3",
+            ],
+            "unit": ["", "kg", "kg", "kg", "kg", "kg", "kg"],
         },
         "drop_rows": [0, 1, 2],
-         "category_col": "Greenhouse Gas Source and Sink Categories",
+        "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2018,
         "year": 2018,
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018_fgases",
         "coords_value_mapping": "2018_fgases",
     },
     },
-    '84': {
-        "templates": ['84_1', '84_2'],
+    "84": {
+        "templates": ["84_1", "84_2"],
         "header": {
         "header": {
-            'entity': ['Categories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'NMVOC'],
-            'unit': ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg'],
+            "entity": ["Categories", "CO2", "CH4", "N2O", "NOx", "CO", "NMVOC"],
+            "unit": ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
         },
         "drop_rows": [0, 1, 2, 3, 4, 5],
         "category_col": "Categories",
@@ -195,11 +281,11 @@ table_defs = {
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018",
         "coords_value_mapping": "2018",
     },
     },
-    '85': {
-        "templates": ['85_1', '85_2'],
+    "85": {
+        "templates": ["85_1", "85_2"],
         "header": {
         "header": {
-            'entity': ['Categories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'NMVOC'],
-            'unit': ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg'],
+            "entity": ["Categories", "CO2", "CH4", "N2O", "NOx", "CO", "NMVOC"],
+            "unit": ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
         },
         "drop_rows": [0, 1, 2, 3, 4, 5],
         "category_col": "Categories",
@@ -207,11 +293,11 @@ table_defs = {
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018",
         "coords_value_mapping": "2018",
     },
     },
-    '92': {
-        "templates": ['92'],
+    "92": {
+        "templates": ["92"],
         "header": {
         "header": {
-            'entity': ['Categories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'NMVOC', 'SO2'],
-            'unit': ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg'],
+            "entity": ["Categories", "CO2", "CH4", "N2O", "NOx", "CO", "NMVOC", "SO2"],
+            "unit": ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
         },
         "drop_rows": [0, 1, 2],
         "category_col": "Categories",
@@ -219,43 +305,43 @@ table_defs = {
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018",
         "coords_value_mapping": "2018",
     },
     },
-    '95': {
-        "templates": ['95_1', '95_2'],
+    "95": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2016,
         "year": 2016,
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
         "coords_value_mapping": "other",
     },
     },
-    '96': {
-        "templates": ['95_1', '95_2'],
+    "96": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2014,
         "year": 2014,
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
         "coords_value_mapping": "other",
     },
     },
-    '97': {
-        "templates": ['95_1', '95_2'],
+    "97": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2012,
         "year": 2012,
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
         "coords_value_mapping": "other",
     },
     },
-    '98': {
-        "templates": ['95_1', '95_2'],
+    "98": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2010,
         "year": 2010,
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
         "coords_value_mapping": "other",
     },
     },
-    '99': {
-        "templates": ['95_1', '95_2'],
+    "99": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2000,
         "year": 2000,
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
         "coords_value_mapping": "other",
     },
     },
-    '100': {
-        "templates": ['95_1', '95_2'],
+    "100": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 1994,
         "year": 1994,
         # "unit_info": unit_info_2018,
         # "unit_info": unit_info_2018,
@@ -264,12 +350,12 @@ table_defs = {
 }

 cat_names_fix = {
-    '14Ab Residential': '1A4b Residential',
+    "14Ab Residential": "1A4b Residential",
 }

 values_replacement = {
-#    '': '-',
-    ' ': '',
+    #    '': '-',
+    " ": "",
 }

 gwp_to_use = "AR5GWP100"
@@ -281,28 +367,28 @@ unit_row = "header"

 ## parameters part 2: conversion to PRIMAP2 interchnage format

-cats_remove = ['Information items']
+cats_remove = ["Information items"]

 cat_codes_manual = {
-    'CO2 from Biomass Combustion for Energy Production': 'M.BIO',
-    'Total National Emissions and Removals': '0',
-    'Total (Net) National Emissions': '0',
-    'Clinical Waste Incineration': 'M.4.C.1',
-    'Hazardous Waste Incineration': 'M.4.C.2',
+    "CO2 from Biomass Combustion for Energy Production": "M.BIO",
+    "Total National Emissions and Removals": "0",
+    "Total (Net) National Emissions": "0",
+    "Clinical Waste Incineration": "M.4.C.1",
+    "Hazardous Waste Incineration": "M.4.C.2",
     #'3 AGRICULTURE': 'M.AG',
-    '3 AGRICULTURE, FORESTRY AND OTHER LAND USE': '3',
+    "3 AGRICULTURE, FORESTRY AND OTHER LAND USE": "3",
     #'3 LAND USE, LAND-USE CHANGE AND FORESTRY': 'M.LULUCF',
 }


-cat_code_regexp = r'(?P<code>^[A-Za-z0-9]{1,7})\s.*'
+cat_code_regexp = r"(?P<code>^[A-Za-z0-9]{1,7})\s.*"
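``cat_code_regexp`` pulls the leading category code out of combined "code plus name" strings; the named group keeps the downstream code readable. A quick illustration:

.. code-block:: python

    import re

    cat_code_regexp = r"(?P<code>^[A-Za-z0-9]{1,7})\s.*"
    match = re.match(cat_code_regexp, "1A2 Manufacturing Industries and Construction")
    print(match.group("code"))  # -> 1A2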

 # special header as category code and name in one column
 header_long = ["orig_cat_name", "entity", "unit", "time", "data"]

 coords_terminologies = {
     "area": "ISO3",
-    "category": "IPCC2006_PRIMAP", #two extra categories
+    "category": "IPCC2006_PRIMAP",  # two extra categories
     "scenario": "PRIMAP",
     "scenario": "PRIMAP",
 }
 }
 
 
@@ -310,63 +396,59 @@ coords_defaults = {
     "source": "SGP-GHG-inventory ",
     "source": "SGP-GHG-inventory ",
     "provenance": "measured",
     "provenance": "measured",
     "area": "SGP",
     "area": "SGP",
-    "scenario": "BUR5"
+    "scenario": "BUR5",
 }

 coords_value_mapping = {
     "2018": {
         "unit": "PRIMAP1",
         "entity": {
-            'HFCs': f'HFCS ({gwp_to_use})',
-            'PFCs': f'PFCS ({gwp_to_use})',
-            'CH4': 'CH4',
-            'N2O': 'N2O',
-            'NF3': f'NF3 ({gwp_to_use})',
-            'Net CO2': 'CO2',
-            'SF6': f'SF6 ({gwp_to_use})',
-            'Total (Net) National Emissions': 'KYOTOGHG (AR5GWP100)',
+            "HFCs": f"HFCS ({gwp_to_use})",
+            "PFCs": f"PFCS ({gwp_to_use})",
+            "CH4": "CH4",
+            "N2O": "N2O",
+            "NF3": f"NF3 ({gwp_to_use})",
+            "Net CO2": "CO2",
+            "SF6": f"SF6 ({gwp_to_use})",
+            "Total (Net) National Emissions": "KYOTOGHG (AR5GWP100)",
         },
     },
     "2018_fgases": {
         "unit": "PRIMAP1",
         "entity": {
-            'HFC-125': 'HFC125',
-            'HFC-134a': 'HFC134a',
-            'HFC-143a': 'HFC143a',
-            'HFC-152a': 'HFC152a',
-            'HFC-227ea': 'HFC227ea',
-            'HFC-23': 'HFC23',
-            'HFC-32': 'HFC32',
-            'HFC-41': 'HFC41',
-            'HFC-43-10mee': 'HFC4310mee',
-            'NF3': 'NF3',
-            'PFC-116': 'C2F6',
-            'PFC-14': 'CF4',
-            'PFC-218': 'C3F8',
-            'PFC-318': 'cC4F8',
-            'SF6': 'SF6',
+            "HFC-125": "HFC125",
+            "HFC-134a": "HFC134a",
+            "HFC-143a": "HFC143a",
+            "HFC-152a": "HFC152a",
+            "HFC-227ea": "HFC227ea",
+            "HFC-23": "HFC23",
+            "HFC-32": "HFC32",
+            "HFC-41": "HFC41",
+            "HFC-43-10mee": "HFC4310mee",
+            "NF3": "NF3",
+            "PFC-116": "C2F6",
+            "PFC-14": "CF4",
+            "PFC-218": "C3F8",
+            "PFC-318": "cC4F8",
+            "SF6": "SF6",
         },
     },
     "other": {
         "unit": "PRIMAP1",
         "entity": {
-            'HFCs': f'HFCS ({gwp_to_use})',
-            'CH4': f'CH4 ({gwp_to_use})',
-            'N2O': f'N2O ({gwp_to_use})',
-            'NF3': f'NF3 ({gwp_to_use})',
-            'Net CO2': 'CO2',
-            'PFCs': f'PFCS ({gwp_to_use})',
-            'SF6': f'SF6 ({gwp_to_use})',
-            'Total (Net) National Emissions': f'KYOTOGHG ({gwp_to_use})',
+            "HFCs": f"HFCS ({gwp_to_use})",
+            "CH4": f"CH4 ({gwp_to_use})",
+            "N2O": f"N2O ({gwp_to_use})",
+            "NF3": f"NF3 ({gwp_to_use})",
+            "Net CO2": "CO2",
+            "PFCs": f"PFCS ({gwp_to_use})",
+            "SF6": f"SF6 ({gwp_to_use})",
+            "Total (Net) National Emissions": f"KYOTOGHG ({gwp_to_use})",
         },
     },
 }

-coords_cols = {
-    "category": "category",
-    "entity": "entity",
-    "unit": "unit"
-}
+coords_cols = {"category": "category", "entity": "entity", "unit": "unit"}

 add_coords_cols = {
     "orig_cat_name": ["orig_cat_name", "category"],
@@ -386,7 +468,7 @@ meta_data = {
     "rights": "",
     "rights": "",
     "contact": "mail@johannes-guetschow.de",
     "contact": "mail@johannes-guetschow.de",
     "title": "Singapore's Fifth National Communication and Fifth Biannial Update "
     "title": "Singapore's Fifth National Communication and Fifth Biannial Update "
-             "Report",
+    "Report",
     "comment": "Read fom pdf file by Johannes Gütschow",
     "comment": "Read fom pdf file by Johannes Gütschow",
     "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
     "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
 }
 }
@@ -394,92 +476,165 @@ meta_data = {

 ## processing
 aggregate_sectors = {
-    '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
-          'name': 'IPPU'},
-    'M.3.C.1.AG': {'sources': ['3.C.1.b', '3.C.1.c'], 'name': 'Emissions from Biomass Burning (Agriculture)'},
-    'M.3.C.1.LU': {'sources': ['3.C.1.a', '3.C.1.d'], 'name': 'Emissions from Biomass Burning (LULUCF)'},
-    'M.3.C.AG': {'sources': ['M.3.C.1.AG', '3.C.2', '3.C.3', '3.C.4', '3.C.5',
-                             '3.C.6', '3.C.7', '3.C.8'],
-                 'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-    'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock emissions'},
-    'M.AG': {'sources': ['M.AG.ELV', '3.A'], 'name': 'Agriculture'},
-    'M.LULUCF': {'sources': ['M.3.C.1.LU', '3.B', '3.D'],
-                 'name': 'Land Use, Land Use Change, and Forestry'},
-    'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'], 'name': 'National Total Excluding LULUCF'},
-    '0': {'sources': ['1', '2', '3', '4', '5'], 'name': 'National Total'},
+    "2": {
+        "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"],
+        "name": "IPPU",
+    },
+    "M.3.C.1.AG": {
+        "sources": ["3.C.1.b", "3.C.1.c"],
+        "name": "Emissions from Biomass Burning (Agriculture)",
+    },
+    "M.3.C.1.LU": {
+        "sources": ["3.C.1.a", "3.C.1.d"],
+        "name": "Emissions from Biomass Burning (LULUCF)",
+    },
+    "M.3.C.AG": {
+        "sources": [
+            "M.3.C.1.AG",
+            "3.C.2",
+            "3.C.3",
+            "3.C.4",
+            "3.C.5",
+            "3.C.6",
+            "3.C.7",
+            "3.C.8",
+        ],
+        "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+    },
+    "M.AG.ELV": {
+        "sources": ["M.3.C.AG"],
+        "name": "Agriculture excluding livestock emissions",
+    },
+    "M.AG": {"sources": ["M.AG.ELV", "3.A"], "name": "Agriculture"},
+    "M.LULUCF": {
+        "sources": ["M.3.C.1.LU", "3.B", "3.D"],
+        "name": "Land Use, Land Use Change, and Forestry",
+    },
+    "M.0.EL": {
+        "sources": ["1", "2", "M.AG", "4", "5"],
+        "name": "National Total Excluding LULUCF",
+    },
+    "0": {"sources": ["1", "2", "3", "4", "5"], "name": "National Total"},
 }


 processing_info_step1 = {
     # aggregate IPPU which is missing for individual fgases so it can be used in the
     # next step (downscaling)
-    'aggregate_cats': {
-        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
-              'name': 'IPPU'},
+    "aggregate_cats": {
+        "2": {
+            "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"],
+            "name": "IPPU",
+        },
     },
-    'tolerance': 1, # because ch4 is inconsistent
+    "tolerance": 1,  # because ch4 is inconsistent
 }
 
-processing_info_step2 =  {
-    'aggregate_cats': aggregate_sectors,
-    'downscale': {
-        'sectors': {
-            'IPPU': {
-                'basket': '2',
-                'basket_contents': ['2.A', '2.B', '2.C', '2.D', '2.E',
-                                    '2.F', '2.G', '2.H'],
-                'entities': ['CO2', 'N2O', f'PFCS ({gwp_to_use})',
-                             f'HFCS ({gwp_to_use})', 'SF6', 'NF3'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+processing_info_step2 = {
+    "aggregate_cats": aggregate_sectors,
+    "downscale": {
+        "sectors": {
+            "IPPU": {
+                "basket": "2",
+                "basket_contents": [
+                    "2.A",
+                    "2.B",
+                    "2.C",
+                    "2.D",
+                    "2.E",
+                    "2.F",
+                    "2.G",
+                    "2.H",
+                ],
+                "entities": [
+                    "CO2",
+                    "N2O",
+                    f"PFCS ({gwp_to_use})",
+                    f"HFCS ({gwp_to_use})",
+                    "SF6",
+                    "NF3",
+                ],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
             # AFOLU downscaling. Most is zero anyway
-            '3C': {
-                'basket': '3.C',
-                'basket_contents': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5',
-                                    '3.C.6', '3.C.7', '3.C.8'],
-                'entities': ['CO2', 'CH4', 'N2O'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "3C": {
+                "basket": "3.C",
+                "basket_contents": [
+                    "3.C.1",
+                    "3.C.2",
+                    "3.C.3",
+                    "3.C.4",
+                    "3.C.5",
+                    "3.C.6",
+                    "3.C.7",
+                    "3.C.8",
+                ],
+                "entities": ["CO2", "CH4", "N2O"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '3C1': {
-                'basket': '3.C.1',
-                'basket_contents': ['3.C.1.a', '3.C.1.b', '3.C.1.c', '3.C.1.d'],
-                'entities': ['CO2', 'CH4', 'N2O'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "3C1": {
+                "basket": "3.C.1",
+                "basket_contents": ["3.C.1.a", "3.C.1.b", "3.C.1.c", "3.C.1.d"],
+                "entities": ["CO2", "CH4", "N2O"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '3D': {
-                'basket': '3.D',
-                'basket_contents': ['3.D.1', '3.D.2'],
-                'entities': ['CO2', 'CH4', 'N2O'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "3D": {
+                "basket": "3.D",
+                "basket_contents": ["3.D.1", "3.D.2"],
+                "entities": ["CO2", "CH4", "N2O"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
         },
-        'entities': {
-            'HFCS': {
-                'basket': f'HFCS ({gwp_to_use})',
-                'basket_contents': ['HFC125', 'HFC134a', 'HFC143a', 'HFC23',
-                                    'HFC32', 'HFC4310mee', 'HFC227ea'],
-                'sel': {'category (IPCC2006_PRIMAP)':
-                            ['0', '2', '2.C', '2.E',
-                             '2.F', '2.G', '2.H']},
+        "entities": {
+            "HFCS": {
+                "basket": f"HFCS ({gwp_to_use})",
+                "basket_contents": [
+                    "HFC125",
+                    "HFC134a",
+                    "HFC143a",
+                    "HFC23",
+                    "HFC32",
+                    "HFC4310mee",
+                    "HFC227ea",
+                ],
+                "sel": {
+                    "category (IPCC2006_PRIMAP)": [
+                        "0",
+                        "2",
+                        "2.C",
+                        "2.E",
+                        "2.F",
+                        "2.G",
+                        "2.H",
+                    ]
+                },
             },
             },
-                'basket': f'PFCS ({gwp_to_use})',
-                'basket_contents': ['C2F6', 'C3F8', 'CF4', 'cC4F8'],
-                'sel': {'category (IPCC2006_PRIMAP)':
-                            ['0', '2', '2.C', '2.E',
-                             '2.F', '2.G', '2.H']},
+            "PFCS": {
+                "basket": f"PFCS ({gwp_to_use})",
+                "basket_contents": ["C2F6", "C3F8", "CF4", "cC4F8"],
+                "sel": {
+                    "category (IPCC2006_PRIMAP)": [
+                        "0",
+                        "2",
+                        "2.C",
+                        "2.E",
+                        "2.F",
+                        "2.G",
+                        "2.H",
+                    ]
+                },
             },
-        }
+        },
     },
-    'remove_ts': {
-        'fgases': { # unnecessary and complicates aggregation for
+    "remove_ts": {
+        "fgases": {  # unnecessary and complicates aggregation for
             # other gases
-            'category': ['5', '5.B'],
-            'entities': [f'HFCS ({gwp_to_use})', f'PFCS ({gwp_to_use})', 'SF6', 'NF3'],
+            "category": ["5", "5.B"],
+            "entities": [f"HFCS ({gwp_to_use})", f"PFCS ({gwp_to_use})", "SF6", "NF3"],
         },
-        'CH4': { # inconsistent with IPPU sector
-            'category': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
-            'entities': ['CH4'],
+        "CH4": {  # inconsistent with IPPU sector
+            "category": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"],
+            "entities": ["CH4"],
         },
     },
     # 'basket_copy': {
@@ -488,6 +643,3 @@ processing_info_step2 =  {
     #     'source_GWP': gwp_to_use,
     #     },
 }
-
-
-

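The aggregate_sectors mapping above follows a {target: {"sources": [...], "name": ...}} convention that the processing step resolves into sums over child categories. A minimal sketch of the idea on a toy, category-indexed DataFrame (the frame and the loop are illustrative assumptions, not the project's actual process_data_for_country implementation):

import pandas as pd

# Toy data: one year column, category codes as the index (illustrative only).
df = pd.DataFrame(
    {"2005": [1.0, 2.0, 3.0]},
    index=pd.Index(["2.A", "2.B", "2.C"], name="category"),
)
spec = {"2": {"sources": ["2.A", "2.B", "2.C"], "name": "IPPU"}}
for target, info in spec.items():
    present = [cat for cat in info["sources"] if cat in df.index]
    # sum the child categories that exist into the basket category
    df.loc[target] = df.loc[present].sum()
print(df)  # the new "2" row holds 6.0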
+ 110 - 72
src/unfccc_ghg_data/unfccc_reader/Singapore/read_SGP_BUR5_from_pdf.py

@@ -1,12 +1,26 @@
-# read Singapore fifth BUR from pdf
+"""
+Read Singapore's BUR5 from pdf
 
+This script reads data from Singapore's BUR5
+Data are read from pdf using camelot
 
+"""
 import locale
 
-#import numpy as np
+# import numpy as np
 import camelot
 import pandas as pd
 import primap2 as pm2
+from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    fix_rows,
+    gas_baskets,
+    process_data_for_country,
+)
+
 from .config_sgp_bur5 import (
     cat_code_regexp,
     cat_codes_manual,
@@ -26,29 +40,20 @@ from .config_sgp_bur5 import (
     table_defs,
     values_replacement,
 )
-from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
-
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    fix_rows,
-    gas_baskets,
-    process_data_for_country,
-)
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
     ### genral configuration
     ### genral configuration
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Singapore' / 'BUR5'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Singapore'
+    input_folder = downloaded_data_path / "UNFCCC" / "Singapore" / "BUR5"
+    output_folder = extracted_data_path / "UNFCCC" / "Singapore"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'SGP_BUR5_2022_'
-    inventory_file_pdf = 'Singapore_-_NC5BUR5.pdf'
-    #years_to_read = range(1990, 2018 + 1)
+    output_filename = "SGP_BUR5_2022_"
+    inventory_file_pdf = "Singapore_-_NC5BUR5.pdf"
+    # years_to_read = range(1990, 2018 + 1)
 
     # define locale to use for str to float conversion
-    locale_to_use = 'en_SG.UTF-8'
+    locale_to_use = "en_SG.UTF-8"
     locale.setlocale(locale.LC_NUMERIC, locale_to_use)

     pagesToRead = table_defs.keys()
@@ -69,9 +74,14 @@ if __name__ == "__main__":
             print(f"Reading table {table_on_page}")
             area = table_def_templates[table_on_page]["area"]
             cols = table_def_templates[table_on_page]["cols"]
-            tables = camelot.read_pdf(str(input_folder / inventory_file_pdf),
-                                      pages=str(page), flavor='stream',
-                                      table_areas=area, columns=cols, split_text=True)
+            tables = camelot.read_pdf(
+                str(input_folder / inventory_file_pdf),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+                columns=cols,
+                split_text=True,
+            )
 
             df_current = tables[0].df.copy(deep=True)
             # drop the old header
@@ -79,39 +89,52 @@ if __name__ == "__main__":
                 df_current = df_current.drop(table_defs[page]["drop_rows"])
             elif "drop_rows" in table_def_templates[table_on_page].keys():
                 df_current = df_current.drop(
-                    table_def_templates[table_on_page]["drop_rows"])
+                    table_def_templates[table_on_page]["drop_rows"]
+                )
             # add new header
-            if 'header' in table_defs[page].keys():
+            if "header" in table_defs[page].keys():
                 df_current.columns = pd.MultiIndex.from_tuples(
-                    zip(table_defs[page]['header']['entity'],
-                        table_defs[page]['header']['unit']))
+                    zip(
+                        table_defs[page]["header"]["entity"],
+                        table_defs[page]["header"]["unit"],
+                    )
+                )
             else:
                 df_current.columns = pd.MultiIndex.from_tuples(
-                    zip(table_def_templates[table_on_page]['header']['entity'],
-                        table_def_templates[table_on_page]['header']['unit']))
+                    zip(
+                        table_def_templates[table_on_page]["header"]["entity"],
+                        table_def_templates[table_on_page]["header"]["unit"],
+                    )
+                )
 
             # drop cols if necessary
             if "drop_cols" in table_defs[page].keys():
-                # print(df_current.columns.values)
+                # print(df_current.columns.to_numpy())
                 df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
             elif "drop_cols" in table_def_templates[table_on_page].keys():
                 df_current = df_current.drop(columns=table_defs[page]["drop_cols"])

             # rename category column
-            df_current.rename(columns={table_defs[page]["category_col"]: index_cols[0]},
-                              inplace=True)
+            df_current = df_current.rename(
+                columns={table_defs[page]["category_col"]: index_cols[0]}
+            )
 
             # replace double \n
             df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
             # replace double and triple spaces
-            df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+            df_current[index_cols[0]] = df_current[index_cols[0]].str.replace(
+                "   ", " "
+            )
             df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")

             # fix the split rows
             for n_rows in table_def_templates[table_on_page]["rows_to_fix"].keys():
-                df_current = fix_rows(df_current,
-                                      table_def_templates[table_on_page]["rows_to_fix"][
-                                          n_rows], index_cols[0], n_rows)
+                df_current = fix_rows(
+                    df_current,
+                    table_def_templates[table_on_page]["rows_to_fix"][n_rows],
+                    index_cols[0],
+                    n_rows,
+                )
 
             # replace category names with typos
             df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)
@@ -122,7 +145,7 @@ if __name__ == "__main__":
             # set index
             # df_current = df_current.set_index(index_cols)
             # strip trailing and leading spaces and remove "^"
-            for col in df_current.columns.values:
+            for col in df_current.columns.to_numpy():
                 df_current[col] = df_current[col].str.strip()
                 df_current[col] = df_current[col].str.replace("^", "")
 
@@ -132,19 +155,24 @@ if __name__ == "__main__":
                 df_this_page = df_current.copy(deep=True)
             else:
                 # find intersecting cols
-                cols_this_page = df_this_page.columns.values
+                cols_this_page = df_this_page.columns.to_numpy()
                 # print(f"cols this page: {cols_this_page}")
-                cols_current = df_current.columns.values
+                cols_current = df_current.columns.to_numpy()
                 # print(f"cols current: {cols_current}")
                 cols_both = list(set(cols_this_page).intersection(set(cols_current)))
                 # print(f"cols both: {cols_both}")
                 if len(cols_both) > 0:
-                    df_this_page = df_this_page.merge(df_current, how='outer', on=cols_both,
-                                                      suffixes=(None, None))
+                    df_this_page = df_this_page.merge(
+                        df_current, how="outer", on=cols_both, suffixes=(None, None)
+                    )
                 else:
-                    df_this_page = df_this_page.merge(df_current, how='outer',
-                                                      left_index=True, right_index=True,
-                                                      suffixes=(None, None))
+                    df_this_page = df_this_page.merge(
+                        df_current,
+                        how="outer",
+                        left_index=True,
+                        right_index=True,
+                        suffixes=(None, None),
+                    )
 
                 df_this_page = df_this_page.groupby(index_cols).first().reset_index()
                 # print(df_this_page)
@@ -152,28 +180,34 @@ if __name__ == "__main__":
 
         # set index and convert to long format
         df_this_page = df_this_page.set_index(index_cols)
-        df_this_page_long = pm2.pm2io.nir_convert_df_to_long(df_this_page,
-                                                             table_defs[page]["year"],
-                                                             header_long)
+        df_this_page_long = pm2.pm2io.nir_convert_df_to_long(
+            df_this_page, table_defs[page]["year"], header_long
+        )
 
         # drop the rows with memo items etc
         for cat in cats_remove:
             df_this_page_long = df_this_page_long.drop(
-                df_this_page_long.loc[df_this_page_long.loc[:, index_cols[0]] == cat].index)
+                df_this_page_long.loc[
+                    df_this_page_long.loc[:, index_cols[0]] == cat
+                ].index
+            )
 
         # make a copy of the categories row
         df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, index_cols[0]]

         # replace cat names by codes in col "Categories"
         # first the manual replacements
-        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, "category"].replace(
-            cat_codes_manual)
+        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+            :, "category"
+        ].replace(cat_codes_manual)
+
         # then the regex replacements
-        def repl(m):
-            return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:,
-                                               "category"].str.replace(cat_code_regexp,
-                                                                       repl, regex=True)
+        def repl(m):  # noqa: D103
+            return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+            :, "category"
+        ].str.replace(cat_code_regexp, repl, regex=True)
         df_this_page_long.loc[:, "category"].unique()

         # strip spaces in data col
@@ -185,27 +219,29 @@ if __name__ == "__main__":
         df_this_page_long.columns = df_this_page_long.columns.map(str)

         # remove thousands separators as pd.to_numeric can't deal with that
-        df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(',',
-                                                                                        '')
+        df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+            ",", ""
+        )
 
         # drop orig cat name as it's not unique over all tables (keep until here in case
         # it's needed for debugging)
-        df_this_page_long = df_this_page_long.drop(columns='orig_cat_name')
+        df_this_page_long = df_this_page_long.drop(columns="orig_cat_name")
 
         data_page_if = pm2.pm2io.convert_long_dataframe_if(
             df_this_page_long,
             coords_cols=coords_cols,
-            #add_coords_cols=add_coords_cols,
+            # add_coords_cols=add_coords_cols,
             coords_defaults=coords_defaults,
             coords_terminologies=coords_terminologies,
             coords_value_mapping=coords_value_mapping[
-                table_defs[page]["coords_value_mapping"]],
+                table_defs[page]["coords_value_mapping"]
+            ],
             # coords_value_filling=coords_value_filling,
             filter_remove=filter_remove,
             # filter_keep=filter_keep,
             meta_data=meta_data,
             convert_str=True,
-            time_format='%Y',
+            time_format="%Y",
         )

         # conversion to PRIMAP2 native format
@@ -226,13 +262,16 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw"), data_if)
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
-
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
     #### processing
     data_proc_pm2 = data_pm2
@@ -246,22 +285,21 @@ if __name__ == "__main__":
         processing_info_country=processing_info_step1,
     )
 
-
     data_proc_pm2 = process_data_for_country(
         data_proc_pm2,
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
         processing_info_country=processing_info_step2,
-        cat_terminology_out = terminology_proc,
-        #category_conversion = None,
-        #sectors_out = None,
+        cat_terminology_out=terminology_proc,
+        # category_conversion = None,
+        # sectors_out = None,
     )

     # adapt source and metadata
     # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
     # ###
     # save data to IF and native format
@@ -270,10 +308,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
-
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

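The reading loop above centres on camelot's stream flavor with explicit table areas. A minimal standalone sketch of that call pattern (file name, page number, and coordinates are placeholders, not values from the commit):

import camelot

# Areas are PDF points given as "x1,y1,x2,y2" strings (top-left and
# bottom-right corners); the values here are placeholders.
tables = camelot.read_pdf(
    "inventory.pdf",
    pages="5",
    flavor="stream",  # no ruled table lines; columns inferred from whitespace
    table_areas=["36,523,563,68"],
    split_text=True,
)
df = tables[0].df  # all cells are strings; header and dtypes are fixed afterwards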
+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Taiwan/__init__.py

@@ -0,0 +1,30 @@
+"""Read Taiwan's inventories
+
+Scripts and configurations to read Taiwan's inventories.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'TWN'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=TWN
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 194 - 120
src/unfccc_ghg_data/unfccc_reader/Taiwan/config_twn_nir2022.py

@@ -1,4 +1,10 @@
-# config and functions for Taiwan NIR 2022
+"""Config for Taiwan's 2022 inventory
+
+Partial configuration for camelot and data aggregation. PRIMAP2 conversion
+config and metadata are defined in the reading script
+
+"""
+
 
 from typing import Union

@@ -6,9 +12,36 @@ import pandas as pd
 
 gwp_to_use = "AR4GWP100"

-def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int)->pd.DataFrame:
+
+def fix_rows(
+    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
+) -> pd.DataFrame:
+    """
+    Combine split rows
+
+    This function combines rows which have been split into several rows during data
+    reading from pdf because they contained line breaks.
+
+    Parameters
+    ----------
+    data: pd.DataFrame
+        The data to work with
+    rows_to_fix: list
+        List of values for which to fix rows
+    col_to_use: str
+        column to use to find the rows to merge
+    n_rows: int
+        How many rows to combine for each row found. e.g. 3 means combine the found
+        row with the following two rows. Negative values are used for more
+        complicated situations where the rows to merge are also before the position
+        of the value that indicates the merge. See code for details
+
+    Returns
+    -------
+        pandas DataFrame with combined rows. The individual rows are removed
+    """
     for row in rows_to_fix:
-        #print(row)
+        # print(row)
         # find the row number and collect the row and the next two rows
         index = data.loc[data[col_to_use] == row].index
         if not list(index):
@@ -20,35 +53,35 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         for item in index:
             loc = data.index.get_loc(item)
             ####print(data[col_to_use].loc[loc + 1])
-            if n_rows == -2:
+            if n_rows == -2:  # noqa: PLR2004
                 locs_to_merge = list(range(loc - 1, loc + 1))
                 loc_to_check = loc - 1
-            #if n_rows == -3:
+            # if n_rows == -3:
             #    locs_to_merge = list(range(loc - 1, loc + 2))
-            #elif n_rows == -5:
+            # elif n_rows == -5:
             #    locs_to_merge = list(range(loc - 1, loc + 4))
             else:
                 locs_to_merge = list(range(loc, loc + n_rows))
                 loc_to_check = loc + 1
 
-            if data[col_to_use].loc[loc_to_check] == '':
+            if not data[col_to_use].loc[loc_to_check]:
                 rows_to_merge = data.iloc[locs_to_merge]
                 indices_to_merge = rows_to_merge.index
                 # replace numerical NaN values
                 ####print(rows_to_merge)
-                rows_to_merge = rows_to_merge.fillna('')
+                rows_to_merge = rows_to_merge.fillna("")
                 ####print("fillna")
                 ####print(rows_to_merge)
                 # join the three rows
-                new_row = rows_to_merge.agg(' '.join)
+                new_row = rows_to_merge.agg(" ".join)
                 # replace the double spaces that are created
                 # must be done here and not at the end as splits are not always
                 # the same and join would produce different col values
                 new_row = new_row.str.replace("  ", " ")
                 new_row = new_row.str.strip()
-                #new_row = new_row.str.replace("N O", "NO")
-                #new_row = new_row.str.replace(", N", ",N")
-                #new_row = new_row.str.replace("- ", "-")
+                # new_row = new_row.str.replace("N O", "NO")
+                # new_row = new_row.str.replace(", N", ",N")
+                # new_row = new_row.str.replace("- ", "-")
                 data.loc[indices_to_merge[0]] = new_row
                 indices_to_drop = indices_to_drop + list(indices_to_merge[1:])
 
@@ -56,12 +89,43 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         data = data.reset_index(drop=True)
     return data
 
-def make_wide_table(data: pd.DataFrame, keyword: str, col: Union[int, str], index_cols: list[Union[int, str]])->pd.DataFrame:
+
+def make_wide_table(
+    data: pd.DataFrame,
+    keyword: str,
+    col: Union[int, str],
+    index_cols: list[Union[int, str]],
+) -> pd.DataFrame:
+    """
+    Transform a table with sections for gases to a gas-wide table
+
+    Some tables are rolled up, i.e. the header repeats within the table and the
+    tables are composed of several tables for different year ranges stacked on top of
+    each other. These tables are unrolled and converted to a proper time-wide format
+    without repetition of headers.
+
+    Parameters
+    ----------
+    data: pd.DataFrame
+        the data to convert
+    keyword: str
+        The keyword used to identify the header, e.g. 'GHG Emission Sources and Sinks'
+    col: Union[int, str]
+        Column to look for the keyword
+    index_cols: list[Union[int, str]]
+        Columns to use as index for the output DataFrame
+
+
+    Returns
+    -------
+        pandas DataFrame in time-wide format
+
+    """
     index = data.loc[data[col] == keyword].index
     if not list(index):
         print("Keyword for table transformation not found")
         return data
-    elif len(index)==1:
+    elif len(index) == 1:
         print("Keyword for table transformation found only once")
         return data
     else:
@@ -74,83 +138,88 @@ def make_wide_table(data: pd.DataFrame, keyword: str, col: Union[int, str], inde
                 next_loc = data.index[-1] + 1
             df_to_add = data.loc[list(range(loc, next_loc))]
             # select only cols which don't have NaN, Null, or '' as header
-            filter_nan = ((~df_to_add.iloc[0].isnull()) & (df_to_add.iloc[0] != 'NaN')& (df_to_add.iloc[0] != ''))
-            df_to_add = df_to_add.loc[: , filter_nan]
+            filter_nan = (
+                (~df_to_add.iloc[0].isna())
+                & (df_to_add.iloc[0] != "NaN")
+                & (df_to_add.iloc[0])
+            )
+            df_to_add = df_to_add.loc[:, filter_nan]
             df_to_add.columns = df_to_add.iloc[0]
-            #print(df_to_add.columns)
+            # print(df_to_add.columns)
             df_to_add = df_to_add.drop(loc)
             df_to_add = df_to_add.set_index(index_cols)

             if df_all is None:
                 df_all = df_to_add
             else:
-                df_all = pd.concat([df_all, df_to_add], axis=1, join='outer')
+                df_all = pd.concat([df_all, df_to_add], axis=1, join="outer")
         return df_all


 # page defs to hold information on reading the table
 page_defs = {
-    '5': {
-        "table_areas": ['36,523,563,68'],
+    "5": {
+        "table_areas": ["36,523,563,68"],
         "split_text": False,
         "flavor": "stream",
     },
-    '6': {
-        "table_areas": ['34,562,563,53'],
-        #"columns": ['195,228,263,295,328,363,395,428,462,495,529'], # works without
+    "6": {
+        "table_areas": ["34,562,563,53"],
+        # "columns": ['195,228,263,295,328,363,395,428,462,495,529'], # works without
         "split_text": True,
         "flavor": "stream",
     },
-    '7': {
-        "table_areas": ['36,740,499,482', '36,430,564,53'],
+    "7": {
+        "table_areas": ["36,740,499,482", "36,430,564,53"],
         "split_text": True,
         "flavor": "stream",
     },
-    '8': {
-        "table_areas": ['35,748,503,567'],
+    "8": {
+        "table_areas": ["35,748,503,567"],
         "split_text": True,
         "flavor": "stream",
     },
-    '9': {
-        "table_areas": ['35,747,565,315', '36,273,565,50'],
+    "9": {
+        "table_areas": ["35,747,565,315", "36,273,565,50"],
         "split_text": False,
         "flavor": "stream",
     },
-    '11': {
-        "table_areas": ['35,744,563,434'],
+    "11": {
+        "table_areas": ["35,744,563,434"],
         "split_text": True,
         "flavor": "stream",
     },
-    '12': {
-        "table_areas": ['33,747,562,86'],
+    "12": {
+        "table_areas": ["33,747,562,86"],
         "split_text": True,
         "flavor": "stream",
     },
-    '13': {
-        "table_areas": ['34,303,564,54'],
+    "13": {
+        "table_areas": ["34,303,564,54"],
         "split_text": True,
         "flavor": "stream",
     },
-    '14': {
-        "table_areas": ['34,754,564,256'],
-        "columns": ['220,251,283,314,344,371,406,438,470,500,530'],
+    "14": {
+        "table_areas": ["34,754,564,256"],
+        "columns": ["220,251,283,314,344,371,406,438,470,500,530"],
         "split_text": True,
         "flavor": "stream",
     },
-    '15': {
-        "table_areas": ['34,487,564,42'],
+    "15": {
+        "table_areas": ["34,487,564,42"],
         "split_text": True,
         "flavor": "stream",
     },
-    '16': {
-        "table_areas": ['34,418,564,125'],
-        #"columns": ['107,209,241,273,306,338,369,402,433,466,498,533'],
+    "16": {
+        "table_areas": ["34,418,564,125"],
+        # "columns": ['107,209,241,273,306,338,369,402,433,466,498,533'],
         "split_text": True,
         "flavor": "lattice",
-    }, # with stream the row index is messed up with lattice the column index ... red with lattice and fix col header manualy
-    '17': {
-        "table_areas": ['34,534,564,49'],
-        "columns": ['188,232,263,298,331,362,398,432,464,497,530'],
+    },  # with stream the row index is messed up with lattice the column index ...
+    # read with lattice and fix col header manually
+    "17": {
+        "table_areas": ["34,534,564,49"],
+        "columns": ["188,232,263,298,331,362,398,432,464,497,530"],
         "split_text": True,
         "flavor": "stream",
     },
@@ -158,38 +227,40 @@ page_defs = {
 
 # table defs to hold information on how to process the tables
 table_defs = {
-    'ES2.2': { # 1990-2020 Carbon Dioxide Emissions and Sequestration in Taiwan
+    "ES2.2": {  # 1990-2020 Carbon Dioxide Emissions and Sequestration in Taiwan
         "tables": [1, 2],
         "rows_to_fix": {
             0: {
-                3: ['1.A.4.c Agriculture, Forestry, Fishery, and',
-                    '2.D Non-Energy Products from Fuels and',
-                    '4. Land Use, Land Use Change and Forestry'],
+                3: [
+                    "1.A.4.c Agriculture, Forestry, Fishery, and",
+                    "2.D Non-Energy Products from Fuels and",
+                    "4. Land Use, Land Use Change and Forestry",
+                ],
             },
         },
-        "index_cols": ['GHG Emission Source and Sinks'],
-        "wide_keyword": 'GHG Emission Source and Sinks',
+        "index_cols": ["GHG Emission Source and Sinks"],
+        "wide_keyword": "GHG Emission Source and Sinks",
         "col_wide_kwd": 0,
         "entity": "CO2",
         "unit": "kt",
         "cat_codes_manual": {
-            'Net GHG Emission (including LULUCF)': '0',
-            'Total GHG Emission (excluding LULUCF)': 'M.0.EL',
+            "Net GHG Emission (including LULUCF)": "0",
+            "Total GHG Emission (excluding LULUCF)": "M.0.EL",
         },
     },
-    'ES2.3': { # 1990-2020 Methane Emissions in Taiwan
+    "ES2.3": {  # 1990-2020 Methane Emissions in Taiwan
         "tables": [3, 4],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "entity": f"CH4 ({gwp_to_use})",
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total Methane Emissions': '0',
+            "Total Methane Emissions": "0",
         },
     },
-    'ES2.4': { # 1990-2020 Nitrous Oxide Emissions in Taiwan
+    "ES2.4": {  # 1990-2020 Nitrous Oxide Emissions in Taiwan
         "tables": [5],
         "fix_cats": {
             0: {
@@ -197,33 +268,33 @@ table_defs = {
             },
         },
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "entity": f"N2O ({gwp_to_use})",
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total Nitrous Oxide Emissions': '0',
+            "Total Nitrous Oxide Emissions": "0",
         },
     },
-    'ES3.1': { # 1990-2020 Greenhouse Gas Emission in Taiwan by Sector
+    "ES3.1": {  # 1990-2020 Greenhouse Gas Emission in Taiwan by Sector
         "tables": [7],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "entity": f"KYOTOGHG ({gwp_to_use})",
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Net GHG Emission (including LULUCF)': '0',
-            'Total GHG Emission (excluding LULUCF)': 'M.0.EL',
+            "Net GHG Emission (including LULUCF)": "0",
+            "Total GHG Emission (excluding LULUCF)": "M.0.EL",
         },
     },
-    'ES3.2': { # 1990-2020 Greenhouse Gas Emissions Produced by Energy Sector in Taiwan
+    "ES3.2": {  # 1990-2020 Greenhouse Gas Emissions Produced by Energy Sector in Taiwan
         "tables": [8],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "gas_splitting": {
             "Total CO2 Emission": "CO2",
@@ -234,17 +305,18 @@ table_defs = {
         },
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total CO2 Emission': '1',
-            'Total CH4 Emission': '1',
-            'Total N2O Emission': '1',
-            'Total Emission from Energy Sector': '1',
+            "Total CO2 Emission": "1",
+            "Total CH4 Emission": "1",
+            "Total N2O Emission": "1",
+            "Total Emission from Energy Sector": "1",
         },
     },
-    'ES3.3': { # 1990-2020 Greenhouse Gas Emissions Produced by Industrial Process and Product Use Sector (IPPU) in Taiwan
-        "tables": [9,10],
+    "ES3.3": {  # 1990-2020 Greenhouse Gas Emissions Produced by Industrial
+        # Process and Product Use Sector (IPPU) in Taiwan
+        "tables": [9, 10],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "gas_splitting": {
             "Total CO2 Emission": "CO2",
@@ -259,24 +331,26 @@ table_defs = {
         },
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total CO2 Emission': '2',
-            'Total CH4 Emission': '2',
-            'Total N2O Emission': '2',
-            'Total HFCs Emission': '2',
-            'Total PFCs Emission (2.E Electronics Industry)': '2.E',
-            'Total SF6 Emission': '2',
-            'Total NF3 Emission (2.E Electronics Industry)': '2.E',
-            'Total Emission from IPPU Sector': '2',
+            "Total CO2 Emission": "2",
+            "Total CH4 Emission": "2",
+            "Total N2O Emission": "2",
+            "Total HFCs Emission": "2",
+            "Total PFCs Emission (2.E Electronics Industry)": "2.E",
+            "Total SF6 Emission": "2",
+            "Total NF3 Emission (2.E Electronics Industry)": "2.E",
+            "Total Emission from IPPU Sector": "2",
         },
         "drop_rows": [
-            ("2.D Non-Energy Products from Fuels and Solvent Use", "CO2"), # has lower significant digits than in table ES2.2
-        ]
+            ("2.D Non-Energy Products from Fuels and Solvent Use", "CO2"),  # has lower
+            # significant digits than in table ES2.2
+        ],
     },
-    'ES3.4': { # 1990-2020 Greenhouse Gas Emissions Produced by Agriculture Sector in Taiwan
+    "ES3.4": {  # 1990-2020 Greenhouse Gas Emissions Produced by Agriculture Sector
+        # in Taiwan
         "tables": [11],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "gas_splitting": {
             "Total CO2 Emission (3.H Urea applied)": "CO2",
@@ -287,22 +361,22 @@ table_defs = {
         },
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total CO2 Emission (3.H Urea applied)': '3.H',
-            'Total CH4 Emission': '3',
-            'Total N2O Emission': '3',
-            'Total Emission From Agriculture Sector': '3',
+            "Total CO2 Emission (3.H Urea applied)": "3.H",
+            "Total CH4 Emission": "3",
+            "Total N2O Emission": "3",
+            "Total Emission From Agriculture Sector": "3",
         },
     },
-    'ES3.6': { # 1990-2020 Greenhouse Gas Emissions in Taiwan by Waste Sector
+    "ES3.6": {  # 1990-2020 Greenhouse Gas Emissions in Taiwan by Waste Sector
         "tables": [13],
         "rows_to_fix": {
             0: {
                 3: ["Total CO2 Emission"],
             },
         },
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
-        "col_wide_kwd": 0, # two column header
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
+        "col_wide_kwd": 0,  # two column header
         "gas_splitting": {
             "Total CO2 Emission (5.C Incineration and Open Burning of Waste)": "CO2",
             "Total CH4 Emission": f"CH4 ({gwp_to_use})",
@@ -312,51 +386,51 @@ table_defs = {
         },
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total CO2 Emission (5.C Incineration and Open Burning of Waste)': '5.C',
-            'Total CH4 Emission': '5',
-            'Total N2O Emission': '5',
-            'Total Emission from Waste Sector': '5',
+            "Total CO2 Emission (5.C Incineration and Open Burning of Waste)": "5.C",
+            "Total CH4 Emission": "5",
+            "Total N2O Emission": "5",
+            "Total Emission from Waste Sector": "5",
         },
     },
 }

 table_defs_skip = {
-    'ES2.1': { # 1990-2020 Greenhouse Gas Emissions and Sequestration in Taiwan by Type
+    "ES2.1": {  # 1990-2020 Greenhouse Gas Emissions and Sequestration in Taiwan by Type
         "tables": [0],
         "rows_to_fix": {
             0: {
-                3: ['CO2'],
+                3: ["CO2"],
             },
             1: {  # where col 0 is empty
-                3: ['Net GHG Emission', 'Total GHG Emission'],
+                3: ["Net GHG Emission", "Total GHG Emission"],
             },
         },
-        "index_cols": ['GHG', 'GWP'],
-        "wide_keyword": 'GHG',
+        "index_cols": ["GHG", "GWP"],
+        "wide_keyword": "GHG",
         "col_wide_kwd": 0,
         "unit": "ktCO2eq",
     },
-    'ES2.5': { # 1990-2020 Fluoride-Containing Gas Emissions in Taiwan
+    "ES2.5": {  # 1990-2020 Fluoride-Containing Gas Emissions in Taiwan
         "tables": [6],
         "rows_to_fix": {
             0: {
-                -2: ['Total SF6 Emissions',
-                     'Total NF3 Emissions'],
+                -2: ["Total SF6 Emissions", "Total NF3 Emissions"],
             },
         },
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
-        #"entity": "CO2",
+        # "entity": "CO2",
         "unit": "ktCO2eq",
     },
-    'ES3.5': { # skip for now: 1990-2020 Changes in Carbon Sequestration by LULUCF Sector in Taiwan2],
+    "ES3.5": {  # skip for now: 1990-2020 Changes in Carbon Sequestration by LULUCF
+        # Sector in Taiwan2],
         "tables": [12],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'], #header is merged col :-(
-        "wide_keyword": 'GHG Emission Sources and Sinks',
-        "col_wide_kwd": 0, # two column header
+        "index_cols": ["GHG Emission Sources and Sinks"],  # header is merged col :-(
+        "wide_keyword": "GHG Emission Sources and Sinks",
+        "col_wide_kwd": 0,  # two column header
         "unit": "kt",
         "entity": "CO2",
-    }, # need to consider the two columns specially (merge?)
+    },  # need to consider the two columns specially (merge?)
 }

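The make_wide_table helper above unrolls tables whose header row repeats for each stacked year-range block. A toy illustration of the same idea (the data are invented and the loop is a simplified sketch, not the function itself):

import pandas as pd

# Two stacked blocks sharing the same header keyword in column 0.
raw = pd.DataFrame(
    [
        ["GHG Emission Sources and Sinks", "1990", "1991"],
        ["1. Energy", "10", "11"],
        ["GHG Emission Sources and Sinks", "1992", "1993"],
        ["1. Energy", "12", "13"],
    ]
)
starts = raw.index[raw[0] == "GHG Emission Sources and Sinks"]
parts = []
for i, start in enumerate(starts):
    end = starts[i + 1] if i + 1 < len(starts) else len(raw)
    block = raw.iloc[start:end].copy()
    block.columns = block.iloc[0]  # first row of each block is its header
    block = block.iloc[1:].set_index("GHG Emission Sources and Sinks")
    parts.append(block)
wide = pd.concat(parts, axis=1)  # one row per category, all years as columns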
+ 164 - 104
src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2022_Inventory_from_pdf.py

@@ -1,14 +1,24 @@
-# this script reads data from Taiwan's 2022 national inventory
-# Data is read from the english summary pdf
-# TODO: add further GWPs and gas baskets
+"""
+Read Taiwan's 2022 national inventory from pdf
+
+This script reads data from Taiwan's 2022 national inventory
+Data are read from the English summary pdf
+TODO: add further GWPs and gas baskets
+
+"""
 
 import copy

 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_twn_nir2022 import (fix_rows, gwp_to_use, make_wide_table, page_defs,
-                                 table_defs)
+from config_twn_nir2022 import (
+    fix_rows,
+    gwp_to_use,
+    make_wide_table,
+    page_defs,
+    table_defs,
+)
 from primap2.pm2io._data_reading import matches_time_format

 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -17,16 +27,16 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'non-UNFCCC' / 'Taiwan'
+    input_folder = downloaded_data_path / "non-UNFCCC" / "Taiwan"
     # TODO: move file to subfolder
-    output_folder = extracted_data_path / 'non-UNFCCC' / 'Taiwan'
+    output_folder = extracted_data_path / "non-UNFCCC" / "Taiwan"
     if not output_folder.exists():
         output_folder.mkdir()

-    output_filename = 'TWN_inventory_2022_'
-    inventory_file = '00_abstract_en.pdf'
+    output_filename = "TWN_inventory_2022_"
+    inventory_file = "00_abstract_en.pdf"

-    cat_code_regexp = r'(?P<code>^[a-zA-Z0-9\.]{1,7})\s.*'
+    cat_code_regexp = r"(?P<code>^[a-zA-Z0-9\.]{1,7})\s.*"

     time_format = "%Y"

@@ -79,42 +89,49 @@ if __name__ == "__main__":
     # config for part3: mapping to 2006 categories

     cat_mapping = {
-        '3': 'M.AG',
-        '3.A': '3.A.1',
-        '3.B': '3.A.2',
-        '3.C': '3.C.7',
-        '3.D': 'M.3.AS',
-        '3.F': '3.C.1.b',
-        '3.H': '3.C.3',
-        '4': 'M.LULUCF',
-        '5': '4',
-        '5.A': '4.A',
-        '5.B': '4.B',
-        '5.C': '4.C',
-        '5.D': '4.D',
-        '5.D.1': '4.D.1',
-        '5.D.2': '4.D.2',
+        "3": "M.AG",
+        "3.A": "3.A.1",
+        "3.B": "3.A.2",
+        "3.C": "3.C.7",
+        "3.D": "M.3.AS",
+        "3.F": "3.C.1.b",
+        "3.H": "3.C.3",
+        "4": "M.LULUCF",
+        "5": "4",
+        "5.A": "4.A",
+        "5.B": "4.B",
+        "5.C": "4.C",
+        "5.D": "4.D",
+        "5.D.1": "4.D.1",
+        "5.D.2": "4.D.2",
     }
     }

     aggregate_cats = {
-                'name': 'Fuel Combustion Activities'},
-        '1.B': {'sources': ['1.B.1', '1.B.2'], 'name': 'Fugitive Emissions from Fuels'},
-        '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-        '3.C.1': {'sources': ['3.C.1.b'], 'name': 'Emissions from Biomass Burning'},
-        '3.C.5': {'sources': ['3.C.5.a', '3.C.5.b'],
-                  'name': 'Indirect N2O Emissions from Managed Soils'},
-        '3.C': {'sources': ['3.C.1', '3.C.3', 'M.3.AS', '3.C.7'],
-                'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
-        'M.AG.ELV': {'sources': ['3.C'],
-                     'name': 'Agriculture excluding livestock emissions'},
+        "1.A": {
+            "sources": ["1.A.1", "1.A.2", "1.A.3", "1.A.4"],
+            "name": "Fuel Combustion Activities",
+        },
+        "1.B": {"sources": ["1.B.1", "1.B.2"], "name": "Fugitive Emissions from Fuels"},
+        "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+        "3.C.1": {"sources": ["3.C.1.b"], "name": "Emissions from Biomass Burning"},
+        "3.C.5": {
+            "sources": ["3.C.5.a", "3.C.5.b"],
+            "name": "Indirect N2O Emissions from Managed Soils",
+        },
+        "3.C": {
+            "sources": ["3.C.1", "3.C.3", "M.3.AS", "3.C.7"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land",
+        },
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
+        "M.AG.ELV": {
+            "sources": ["3.C"],
+            "name": "Agriculture excluding livestock emissions",
+        },
     }

-
     # 2 for NF3, PFCs (from 2.E)
     aggregate_cats_NF3_PFC = {
-        '2': {'sources': ['2.E'], 'name': 'Industrial Process and Product Use Sector'},
+        "2": {"sources": ["2.E"], "name": "Industrial Process and Product Use Sector"},
     }

     compression = dict(zlib=True, complevel=9)
@@ -130,11 +147,10 @@ if __name__ == "__main__":
             str(input_folder / inventory_file),
             pages=page,
             **page_defs[page],
-            )
+        )
         for table in new_tables:
             all_tables.append(table.df)

-
     # ###
     # convert tables to primap2 format
     # ###
@@ -148,39 +164,49 @@ if __name__ == "__main__":
         if len(table_def["tables"]) > 1:
             for table in table_def["tables"][1:]:
                 df_this_table = pd.concat(
-                    [df_this_table, all_tables[table]],
-                    axis=0,
-                    join='outer')
+                    [df_this_table, all_tables[table]], axis=0, join="outer"
+                )

         # fix for table ES3.6
-        if table_name == 'ES3.6':
+        if table_name == "ES3.6":
             col_idx = df_this_table[0] == "Total CO Emission"
-            df_this_table.loc[col_idx, 1:] = ''
-            df_this_table.loc[col_idx, 0] = 'Total CO2 Emission'
+            df_this_table.loc[col_idx, 1:] = ""
+            df_this_table.loc[col_idx, 0] = "Total CO2 Emission"

         df_this_table = df_this_table.reset_index(drop=True)

         # fix categories if necessary
         if "fix_cats" in table_def.keys():
             for col in table_def["fix_cats"]:
-                df_this_table[col] = df_this_table[col].replace(table_def["fix_cats"][col])
+                df_this_table[col] = df_this_table[col].replace(
+                    table_def["fix_cats"][col]
+                )

         # fix rows
         for col in table_def["rows_to_fix"].keys():
             for n_rows in table_def["rows_to_fix"][col].keys():
                 print(f"Fixing {col}, {n_rows}")
                 # replace line breaks, long hyphens, double, and triple spaces in category names
-                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
-                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
-                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
-                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("–", "-")
-                df_this_table = fix_rows(df_this_table,
-                                         table_def["rows_to_fix"][col][n_rows], col, n_rows)
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "\n", " "
+                )
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "   ", " "
+                )
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "  ", " "
+                )
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "–", "-"
+                )
+                df_this_table = fix_rows(
+                    df_this_table, table_def["rows_to_fix"][col][n_rows], col, n_rows
+                )
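+                # fix_rows (from config_twn_nir2022) merges category labels that the
+                # PDF split across several physical rows; n_rows gives the number of
+                # raw rows that form one logical row (see the config for details)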

         # split by entity
         if "gas_splitting" in table_def.keys():
-            col_entity = [''] * len(df_this_table)
-            last_entity = ''
+            col_entity = [""] * len(df_this_table)
+            last_entity = ""
             for i in range(0, len(df_this_table)):
                 current_header = df_this_table[table_def["col_wide_kwd"]].iloc[i]
                 if current_header in table_def["gas_splitting"].keys():
@@ -191,8 +217,12 @@ if __name__ == "__main__":
             table_def["index_cols"].append("entity")
             table_def["index_cols"].append("entity")
 
 
         # make a wide table
         # make a wide table
-        df_this_table = make_wide_table(df_this_table, table_def["wide_keyword"],
-                                        table_def["col_wide_kwd"], table_def["index_cols"])
+        df_this_table = make_wide_table(
+            df_this_table,
+            table_def["wide_keyword"],
+            table_def["col_wide_kwd"],
+            table_def["index_cols"],
+        )

         if "drop_rows" in table_def.keys():
             df_this_table = df_this_table.drop(table_def["drop_rows"], axis=0)
@@ -207,11 +237,12 @@ if __name__ == "__main__":
         # add unit
         df_this_table["unit"] = table_def["unit"]

-        df_this_table = df_this_table.rename({table_def["index_cols"][0]: "orig_cat_name"},
-                                             axis=1)
+        df_this_table = df_this_table.rename(
+            {table_def["index_cols"][0]: "orig_cat_name"}, axis=1
+        )

         # print(table_def["index_cols"][0])
-        # print(df_this_table.columns.values)
+        # print(df_this_table.columns.to_numpy())

         # make a copy of the categories row
         df_this_table["category"] = df_this_table["orig_cat_name"]
@@ -219,25 +250,30 @@ if __name__ == "__main__":
         # replace cat names by codes in col "category"
         # first the manual replacements
         df_this_table["category"] = df_this_table["category"].replace(
-            table_def["cat_codes_manual"])
+            table_def["cat_codes_manual"]
+        )
+
         # then the regex replacements
-        def repl(m):
-            return m.group('code')
-        df_this_table["category"] = df_this_table["category"].str.replace(cat_code_regexp,
-                                                                          repl, regex=True)
+        def repl(m):  # noqa: D103
+            return m.group("code")
+
+        df_this_table["category"] = df_this_table["category"].str.replace(
+            cat_code_regexp, repl, regex=True
+        )
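+        # e.g. "1.A.2 Manufacturing Industries and Construction" -> "1.A.2"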

         ### convert to PRIMAP2 IF
         # remove ','
-        time_format = '%Y'
+        time_format = "%Y"
         time_columns = [
             col
-            for col in df_this_table.columns.values
+            for col in df_this_table.columns.to_numpy()
             if matches_time_format(col, time_format)
         ]

         for col in time_columns:
-            df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(',', '',
-                                                                              regex=False)
+            df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(
+                ",", "", regex=False
+            )
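+            # values are still strings at this point; stripping the thousands
+            # separators keeps the later numeric conversion from failing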

         # drop orig_cat_name as it's not unique per category
         df_this_table = df_this_table.drop(columns="orig_cat_name")
@@ -254,7 +290,7 @@ if __name__ == "__main__":
             # coords_value_filling=coords_value_filling,
             # filter_remove=filter_remove,
             # filter_keep=filter_keep,
-            meta_data=meta_data
+            meta_data=meta_data,
         )

         this_table_pm2 = pm2.pm2io.from_interchange_format(df_this_table_if)
@@ -267,7 +303,6 @@ if __name__ == "__main__":
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()

-
     # ###
     # convert to IPCC2006 categories
     # ###
@@ -275,31 +310,36 @@ if __name__ == "__main__":
     data_if_2006
     # filter_data(data_if_2006, filter_remove=filter_remove_IPCC2006)
     data_if_2006 = data_if_2006.replace(
-        {'category (IPCC2006_1996_Taiwan_Inv)': cat_mapping})
+        {"category (IPCC2006_1996_Taiwan_Inv)": cat_mapping}
+    )

     # rename the category col
-    data_if_2006.rename(
-        columns={'category (IPCC2006_1996_Taiwan_Inv)': 'category (IPCC2006_PRIMAP)'},
-        inplace=True)
-    data_if_2006.attrs['attrs']['cat'] = 'category (IPCC2006_PRIMAP)'
-    data_if_2006.attrs['dimensions']['*'] = [
-        'category (IPCC2006_PRIMAP)' if item == 'category (IPCC2006_1996_Taiwan_Inv)'
-        else item for item in data_if_2006.attrs['dimensions']['*']]
+    data_if_2006 = data_if_2006.rename(
+        columns={"category (IPCC2006_1996_Taiwan_Inv)": "category (IPCC2006_PRIMAP)"}
+    )
+    data_if_2006.attrs["attrs"]["cat"] = "category (IPCC2006_PRIMAP)"
+    data_if_2006.attrs["dimensions"]["*"] = [
+        "category (IPCC2006_PRIMAP)"
+        if item == "category (IPCC2006_1996_Taiwan_Inv)"
+        else item
+        for item in data_if_2006.attrs["dimensions"]["*"]
+    ]
 

     # aggregate categories
     for cat_to_agg in aggregate_cats:
         mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
+            aggregate_cats[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]

         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)

-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]

@@ -307,8 +347,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")

             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
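+            # min_count=1 keeps the aggregate NaN instead of 0 when all inputs are NaN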

             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
             # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
@@ -324,19 +371,21 @@ if __name__ == "__main__":
     # aggregate categories
     for cat_to_agg in aggregate_cats_NF3_PFC:
         mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
-            aggregate_cats_NF3_PFC[cat_to_agg]["sources"])
+            aggregate_cats_NF3_PFC[cat_to_agg]["sources"]
+        )
         mask_gas = data_if_2006["entity"].isin(
-            [f"NF3 ({gwp_to_use})", f"PFCS ({gwp_to_use})"])
+            [f"NF3 ({gwp_to_use})", f"PFCS ({gwp_to_use})"]
+        )
         df_test = data_if_2006[mask & mask_gas]

         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)

-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]

@@ -344,8 +393,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")

             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)

             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
             # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
@@ -362,7 +418,7 @@ if __name__ == "__main__":
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)

     # convert to mass units from CO2eq
-    entities_to_convert = ['N2O', 'SF6', 'CH4', 'NF3']
+    entities_to_convert = ["N2O", "SF6", "CH4", "NF3"]
     entities_to_convert = [f"{entity} ({gwp_to_use})" for entity in entities_to_convert]
     entities_to_convert = [f"{entity} ({gwp_to_use})" for entity in entities_to_convert]
 
 
     for entity in entities_to_convert:
     for entity in entities_to_convert:
@@ -382,19 +438,23 @@ if __name__ == "__main__":
     # save data
     # ###
     # data in original categories
-    pm2.pm2io.write_interchange_format(output_folder /
-                                       (output_filename + coords_terminologies["category"]),
-                                       data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf((output_folder /
-                          (output_filename + coords_terminologies[
-                              "category"])).with_suffix(".nc"),
-                          encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        (
+            output_folder / (output_filename + coords_terminologies["category"])
+        ).with_suffix(".nc"),
+        encoding=encoding,
+    )

     # data in 2006 categories
-    pm2.pm2io.write_interchange_format(output_folder /
-                                       (output_filename + "IPCC2006_PRIMAP"), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006
+    )
     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf((output_folder /
-                                (output_filename + "IPCC2006_PRIMAP")).with_suffix(".nc"),
-                               encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        (output_folder / (output_filename + "IPCC2006_PRIMAP")).with_suffix(".nc"),
+        encoding=encoding,
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Thailand/__init__.py

@@ -0,0 +1,30 @@
+"""Read Thailand's BURs, NIRs, NCs
+
+Scripts and configurations to read Thailand's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'THA'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=THA
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 405 - 223
src/unfccc_ghg_data/unfccc_reader/Thailand/config_tha_bur3.py

@@ -1,38 +1,54 @@
-# configuration for Thailand, BUR4
+"""Config for Thailand's BUR4
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
+
 # ###
 # for reading
 # ###

 # general
 gwp_to_use = "AR4GWP100"
-terminology_proc = 'IPCC2006_PRIMAP'
+terminology_proc = "IPCC2006_PRIMAP"

-header_inventory = ['Greenhouse gas source and sink categories',
-                   'CO2 emissions', 'CO2 removals',
-                   'CH4', 'N2O', 'NOx', 'CO', 'NMVOCs',
-                   'SO2', 'HFCs', 'PFCs', 'SF6']
-unit_inventory = ['Gg'] * len(header_inventory)
+header_inventory = [
+    "Greenhouse gas source and sink categories",
+    "CO2 emissions",
+    "CO2 removals",
+    "CH4",
+    "N2O",
+    "NOx",
+    "CO",
+    "NMVOCs",
+    "SO2",
+    "HFCs",
+    "PFCs",
+    "SF6",
+]
+unit_inventory = ["Gg"] * len(header_inventory)
 unit_inventory[9] = "GgCO2eq"
 unit_inventory[10] = "GgCO2eq"

 # 2016 inventory
 inv_conf = {
-    'year': 2016,
-    'entity_row': 0,
-    'unit_row': 1,
-    'index_cols': "Greenhouse gas source and sink categories",
-    'header': header_inventory,
-    'unit': unit_inventory,
+    "year": 2016,
+    "entity_row": 0,
+    "unit_row": 1,
+    "index_cols": "Greenhouse gas source and sink categories",
+    "header": header_inventory,
+    "unit": unit_inventory,
     # special header as category code and name in one column
-    'header_long': ["orig_cat_name", "entity", "unit", "time", "data"],
+    "header_long": ["orig_cat_name", "entity", "unit", "time", "data"],
     # manual category codes (manual mapping to primap1, will be mapped to primap2
     # automatically with the other codes)
-    'cat_codes_manual': {
-        '6. Other Memo Items (not accounted in Total Emissions)': 'MEMO',
-        'International Bunkers': 'MBK',
-        'CO2 from Biomass': 'MBIO',
+    "cat_codes_manual": {
+        "6. Other Memo Items (not accounted in Total Emissions)": "MEMO",
+        "International Bunkers": "MBK",
+        "CO2 from Biomass": "MBIO",
     },
-    'cat_code_regexp': r'^(?P<code>[a-zA-Z0-9]{1,4})[\s\.].*',
+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9]{1,4})[\s\.].*",
 }

 # primap2 format conversion
@@ -59,14 +75,14 @@ coords_value_mapping = {
     "unit": "PRIMAP1",
     "unit": "PRIMAP1",
     "category": "PRIMAP1",
     "category": "PRIMAP1",
     "entity": {
     "entity": {
-        'HFCs': f"HFCS ({gwp_to_use})",
-        'PFCs': f"PFCS ({gwp_to_use})",
-        'NMVOCs': 'NMVOC',
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "NMVOCs": "NMVOC",
     },
 }

 filter_remove = {
-    'f_memo': {"category": "MEMO"},
+    "f_memo": {"category": "MEMO"},
 }
 filter_keep = {}

@@ -81,26 +97,31 @@ meta_data = {

 # main sector time series
 header_main_sector_ts = [
-    'Year', 'Energy', 'IPPU',
-    'Agriculture', 'LULUCF', 'Waste',
-    'Net emissions (Including LULUCF)',
-    'Net emissions (Excluding LULUCF)']
-unit_main_sector_ts = ['GgCO2eq'] * len(header_main_sector_ts)
-unit_main_sector_ts[0] = ''
+    "Year",
+    "Energy",
+    "IPPU",
+    "Agriculture",
+    "LULUCF",
+    "Waste",
+    "Net emissions (Including LULUCF)",
+    "Net emissions (Excluding LULUCF)",
+]
+unit_main_sector_ts = ["GgCO2eq"] * len(header_main_sector_ts)
+unit_main_sector_ts[0] = ""

 trend_conf = {
-    'header': header_main_sector_ts,
-    'unit': unit_main_sector_ts,
+    "header": header_main_sector_ts,
+    "unit": unit_main_sector_ts,
     # manual category codes (manual mapping to primap1, will be mapped to primap2
     # automatically with the other codes)
-    'cat_codes_manual': {
-        'Energy': "1",
-        'IPPU': "2",
-        'Agriculture': "3",
-        'LULUCF': "4",
-        'Waste': "5",
-        'Net emissions (Including LULUCF)': "0",
-        'Net emissions (Excluding LULUCF)': "M0EL",
+    "cat_codes_manual": {
+        "Energy": "1",
+        "IPPU": "2",
+        "Agriculture": "3",
+        "LULUCF": "4",
+        "Waste": "5",
+        "Net emissions (Including LULUCF)": "0",
+        "Net emissions (Excluding LULUCF)": "M0EL",
     },
     },
 }
 
@@ -118,14 +139,13 @@ coords_defaults_main_sector_ts = {
 }

 # indirect gases time series
-header_indirect = ['Year', 'NOx', 'CO',
-                    'NMVOCs', 'SO2']
-unit_indirect = ['Gg'] * len(header_indirect)
-unit_indirect[0] = ''
+header_indirect = ["Year", "NOx", "CO", "NMVOCs", "SO2"]
+unit_indirect = ["Gg"] * len(header_indirect)
+unit_indirect[0] = ""
 ind_conf = {
-    'header': header_indirect,
-    'unit': unit_indirect,
-    'cols_to_remove': ['Average Annual Growth Rate'],
+    "header": header_indirect,
+    "unit": unit_indirect,
+    "cols_to_remove": ["Average Annual Growth Rate"],
 }

 coords_cols_indirect = {
@@ -146,111 +166,203 @@ coords_defaults_indirect = {
 # ###
 # aggregate categories
 country_processing_step1 = {
-    'aggregate_cats': {
-        '2.A.4': {'sources': ['2.A.4.b', '2.A.4.d'],
-                  'name': 'Other Process uses of Carbonates'},
+    "aggregate_cats": {
+        "2.A.4": {
+            "sources": ["2.A.4.b", "2.A.4.d"],
+            "name": "Other Process uses of Carbonates",
+        },
     },
-    'aggregate_gases': {
-        'KYOTOGHG': {
-            'basket': 'KYOTOGHG (AR4GWP100)',
-            'basket_contents': ['CO2', 'CH4', 'N2O', 'SF6',
-                                'HFCS (AR4GWP100)', 'PFCS (AR4GWP100)'],
-            'skipna': True,
-            'min_count': 1,
-            'sel': {f'category ({coords_terminologies["category"]})':
-                [
-                    '0', '1', '1.A', '1.A.1', '1.A.2', '1.A.3',
-                    '1.A.4', '1.B', '1.B.1', '1.B.2',
-                    '1.C',
-                    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4',
-                    '2.B', '2.C', '2.D', '2.H',
-                    '3', '3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                    '3.H', '3.I',
-                    '4', '4.A', '4.B', '4.C', '4.D', '4.E',
-                    '5', '5.A', '5.B', '5.C', '5.D'
+    "aggregate_gases": {
+        "KYOTOGHG": {
+            "basket": "KYOTOGHG (AR4GWP100)",
+            "basket_contents": [
+                "CO2",
+                "CH4",
+                "N2O",
+                "SF6",
+                "HFCS (AR4GWP100)",
+                "PFCS (AR4GWP100)",
+            ],
+            "skipna": True,
+            "min_count": 1,
+            "sel": {
+                f'category ({coords_terminologies["category"]})': [
+                    "0",
+                    "1",
+                    "1.A",
+                    "1.A.1",
+                    "1.A.2",
+                    "1.A.3",
+                    "1.A.4",
+                    "1.B",
+                    "1.B.1",
+                    "1.B.2",
+                    "1.C",
+                    "2",
+                    "2.A",
+                    "2.A.1",
+                    "2.A.2",
+                    "2.A.3",
+                    "2.A.4",
+                    "2.B",
+                    "2.C",
+                    "2.D",
+                    "2.H",
+                    "3",
+                    "3.A",
+                    "3.B",
+                    "3.C",
+                    "3.D",
+                    "3.E",
+                    "3.F",
+                    "3.G",
+                    "3.H",
+                    "3.I",
+                    "4",
+                    "4.A",
+                    "4.B",
+                    "4.C",
+                    "4.D",
+                    "4.E",
+                    "5",
+                    "5.A",
+                    "5.B",
+                    "5.C",
+                    "5.D",
                 ]
-            }, # not tested
+            },  # not tested
         },
     },
 }

 country_processing_step2 = {
-    'downscale': {
+    "downscale": {
         # main sectors present as KYOTOGHG sum. subsectors need to be downscaled
         # TODO: downscale CO, NOx, NMVOC, SO2 (national total present)
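+        # downscaling splits a sector's KYOTOGHG total onto its subsectors using
+        # the shares from years in which the subsector data are reported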
-        'sectors': {
-            '1': {
-                'basket': '1',
-                'basket_contents': ['1.A', '1.B', '1.C'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+        "sectors": {
+            "1": {
+                "basket": "1",
+                "basket_contents": ["1.A", "1.B", "1.C"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '1.A': {
-                'basket': '1.A',
-                'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "1.A": {
+                "basket": "1.A",
+                "basket_contents": ["1.A.1", "1.A.2", "1.A.3", "1.A.4"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '1.B': {
-                'basket': '1.B',
-                'basket_contents': ['1.B.1', '1.B.2'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "1.B": {
+                "basket": "1.B",
+                "basket_contents": ["1.B.1", "1.B.2"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '2': {
-                'basket': '2',
-                'basket_contents': ['2.A', '2.B', '2.C', '2.D', '2.H'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "2": {
+                "basket": "2",
+                "basket_contents": ["2.A", "2.B", "2.C", "2.D", "2.H"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '2.A': {
-                'basket': '2.A',
-                'basket_contents': ['2.A.1', '2.A.2', '2.A.3', '2.A.4'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "2.A": {
+                "basket": "2.A",
+                "basket_contents": ["2.A.1", "2.A.2", "2.A.3", "2.A.4"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '3': {
-                'basket': '3',
-                'basket_contents': ['3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                                    '3.H', '3.I'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "3": {
+                "basket": "3",
+                "basket_contents": [
+                    "3.A",
+                    "3.B",
+                    "3.C",
+                    "3.D",
+                    "3.E",
+                    "3.F",
+                    "3.G",
+                    "3.H",
+                    "3.I",
+                ],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '4': {
-                'basket': '4',
-                'basket_contents': ['4.A', '4.B', '4.C', '4.D', '4.E'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "4": {
+                "basket": "4",
+                "basket_contents": ["4.A", "4.B", "4.C", "4.D", "4.E"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
             },
-                'basket': '5',
-                'basket_contents': ['5.A', '5.B', '5.C', '5.D'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "5": {
+                "basket": "5",
+                "basket_contents": ["5.A", "5.B", "5.C", "5.D"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
             },
         },
-            'KYOTO': {
-                'basket': 'KYOTOGHG (AR4GWP100)',
-                'basket_contents': ['CH4', 'CO2', 'N2O', 'HFCS (AR4GWP100)',
-                                    'PFCS (AR4GWP100)', 'SF6'],
-                'sel': {f'category ({coords_terminologies["category"]})':
-                    [
-                        '0', '1', '1.A', '1.A.1', '1.A.2', '1.A.3',
-                        '1.A.4', '1.B', '1.B.1', '1.B.2', '1.C',
-                        '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4',
-                        '2.B', '2.C', '2.D', '2.H',
-                        '3', '3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                        '3.H', '3.I',
-                        '4', '4.A', '4.B', '4.C', '4.D', '4.E',
-                        '5', '5.A', '5.B', '5.C', '5.D']},
+        "entities": {
+            "KYOTO": {
+                "basket": "KYOTOGHG (AR4GWP100)",
+                "basket_contents": [
+                    "CH4",
+                    "CO2",
+                    "N2O",
+                    "HFCS (AR4GWP100)",
+                    "PFCS (AR4GWP100)",
+                    "SF6",
+                ],
+                "sel": {
+                    f'category ({coords_terminologies["category"]})': [
+                        "0",
+                        "1",
+                        "1.A",
+                        "1.A.1",
+                        "1.A.2",
+                        "1.A.3",
+                        "1.A.4",
+                        "1.B",
+                        "1.B.1",
+                        "1.B.2",
+                        "1.C",
+                        "2",
+                        "2.A",
+                        "2.A.1",
+                        "2.A.2",
+                        "2.A.3",
+                        "2.A.4",
+                        "2.B",
+                        "2.C",
+                        "2.D",
+                        "2.H",
+                        "3",
+                        "3.A",
+                        "3.B",
+                        "3.C",
+                        "3.D",
+                        "3.E",
+                        "3.F",
+                        "3.G",
+                        "3.H",
+                        "3.I",
+                        "4",
+                        "4.A",
+                        "4.B",
+                        "4.C",
+                        "4.D",
+                        "4.E",
+                        "5",
+                        "5.A",
+                        "5.B",
+                        "5.C",
+                        "5.D",
+                    ]
+                },
             },
         },
     },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
     },
     },
 }
 ## not in BUR3: 1.A.1.a, 1.A.1.b, 1.A.3.a, 1.A.3.b, 1.A.3.c, 1.A.3.d, 1.A.5, 1.B.3,
 # 4.E.x, 5.X.y M.BK.A, M.BK.M
 # 4.E.x, 5.X.y M.BK.A, M.BK.M

 cat_conversion = {
-        '0': '0',
-        'M.0.EL': 'M.0.EL',
-        '1': '1',
-        '1.A': '1.A',
-        '1.A.1': '1.A.1',
-        '1.A.2': '1.A.2',
-        '1.A.3': '1.A.3',
-        '1.A.4': '1.A.4',
-        '1.B': '1.B',
-        '1.B.1': '1.B.1',
-        '1.B.2': '1.B.2',
-        '1.C': '1.C',
-        '1.C.1': '1.C.1',
-        '1.C.2': '1.C.2',
-        '1.C.3': '1.C.3',
-        '2': '2',
-        '2.A': '2.A',
-        '2.A.1': '2.A.1',
-        '2.A.2': '2.A.2',
-        '2.A.3': '2.A.3',
-        '2.A.4': '2.A.4',
-        '2.A.4.b': '2.A.4.b',
-        '2.A.4.d': '2.A.4.d',
-        '2.B': '2.B',
-        '2.C': '2.C',
-        '2.C.1': '2.C.1',
-        '2.D': '2.D',
-        '2.D.1': '2.D.1',
-        '2.H': '2.H',
-        '2.H.1': '2.H.1',
-        '2.H.2': '2.H.2',
-        '3': 'M.AG',
-        '3.A': '3.A.1',
-        '3.B': '3.A.2',
-        '3.C': 'M.3.C.1.AG',  # field burning of agricultural residues
-        '3.D': '3.C.2',  # Liming
-        '3.E': '3.C.3',  # urea application
-        '3.F': '3.C.4',  # direct N2O from agri soils
-        '3.G': '3.C.5',  # indirect N2O from agri soils
-        '3.H': '3.C.6',  # indirect N2O from manure management
-        '3.I': '3.C.7',  # rice
-        '4': 'M.LULUCF',
-        '4.A': '3.B.1.a',  # forest remaining forest
-        '4.B': '3.B.2.a',  # cropland remaining cropland
-        '4.C': '3.B.2.b',  # land converted to cropland
-        '4.D': '3.B.6.b',  # land converted to other land
-        '4.E': 'M.3.C.1.LU',  # biomass burning (LULUCF)
-        '5': '4',
-        '5.A': '4.A',
-        '5.B': '4.B',
-        '5.C': '4.C',
-        '5.D': '4.D',
-        'M.BK': 'M.BK',
-        'M.BIO': 'M.BIO',
+    "mapping": {
+        "0": "0",
+        "M.0.EL": "M.0.EL",
+        "1": "1",
+        "1.A": "1.A",
+        "1.A.1": "1.A.1",
+        "1.A.2": "1.A.2",
+        "1.A.3": "1.A.3",
+        "1.A.4": "1.A.4",
+        "1.B": "1.B",
+        "1.B.1": "1.B.1",
+        "1.B.2": "1.B.2",
+        "1.C": "1.C",
+        "1.C.1": "1.C.1",
+        "1.C.2": "1.C.2",
+        "1.C.3": "1.C.3",
+        "2": "2",
+        "2.A": "2.A",
+        "2.A.1": "2.A.1",
+        "2.A.2": "2.A.2",
+        "2.A.3": "2.A.3",
+        "2.A.4": "2.A.4",
+        "2.A.4.b": "2.A.4.b",
+        "2.A.4.d": "2.A.4.d",
+        "2.B": "2.B",
+        "2.C": "2.C",
+        "2.C.1": "2.C.1",
+        "2.D": "2.D",
+        "2.D.1": "2.D.1",
+        "2.H": "2.H",
+        "2.H.1": "2.H.1",
+        "2.H.2": "2.H.2",
+        "3": "M.AG",
+        "3.A": "3.A.1",
+        "3.B": "3.A.2",
+        "3.C": "M.3.C.1.AG",  # field burning of agricultural residues
+        "3.D": "3.C.2",  # Liming
+        "3.E": "3.C.3",  # urea application
+        "3.F": "3.C.4",  # direct N2O from agri soils
+        "3.G": "3.C.5",  # indirect N2O from agri soils
+        "3.H": "3.C.6",  # indirect N2O from manure management
+        "3.I": "3.C.7",  # rice
+        "4": "M.LULUCF",
+        "4.A": "3.B.1.a",  # forest remaining forest
+        "4.B": "3.B.2.a",  # cropland remaining cropland
+        "4.C": "3.B.2.b",  # land converted to cropland
+        "4.D": "3.B.6.b",  # land converted to other land
+        "4.E": "M.3.C.1.LU",  # biomass burning (LULUCF)
+        "5": "4",
+        "5.A": "4.A",
+        "5.B": "4.B",
+        "5.C": "4.C",
+        "5.D": "4.D",
+        "M.BK": "M.BK",
+        "M.BIO": "M.BIO",
     },
     },
-        '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-        '3.C.1': {'sources': ['M.3.C.1.AG', 'M.3.C.1.LU'],
-                  'name': 'Emissions from Biomass Burning'},
-        '3.C': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-                'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-        'M.3.C.AG': {
-            'sources': ['M.3.C.1.AG', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG'],
-                     'name': 'Agriculture excluding livestock emissions'},
-        'M.3.C.LU': {'sources': ['M.3.C.1.LU'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land (Land use)'},
-        '3.B.1': {'sources': ['3.B.1.a'], 'name': 'Forest Land'},
-        '3.B.2': {'sources': ['3.B.2.a', '3.B.2.b'], 'name': 'Cropland'},
-        '3.B.6': {'sources': ['3.B.6.b'], 'name': 'Other Land'},
-        '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.6'], 'name': 'Land'},
-        'M.LULUCF': {'sources': ['3.B', 'N.3.C.LU'], 'name': 'LULUCF'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    "aggregate": {
+        "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+        "3.C.1": {
+            "sources": ["M.3.C.1.AG", "M.3.C.1.LU"],
+            "name": "Emissions from Biomass Burning",
+        },
+        "3.C": {
+            "sources": ["3.C.1", "3.C.2", "3.C.3", "3.C.4", "3.C.5", "3.C.6", "3.C.7"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land",
+        },
+        "M.3.C.AG": {
+            "sources": [
+                "M.3.C.1.AG",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+            ],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+        },
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG"],
+            "name": "Agriculture excluding livestock emissions",
+        },
+        "M.3.C.LU": {
+            "sources": ["M.3.C.1.LU"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Land use)",
+        },
+        "3.B.1": {"sources": ["3.B.1.a"], "name": "Forest Land"},
+        "3.B.2": {"sources": ["3.B.2.a", "3.B.2.b"], "name": "Cropland"},
+        "3.B.6": {"sources": ["3.B.6.b"], "name": "Other Land"},
+        "3.B": {"sources": ["3.B.1", "3.B.2", "3.B.6"], "name": "Land"},
+        "M.LULUCF": {"sources": ["3.B", "N.3.C.LU"], "name": "LULUCF"},
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
     },
     },
 }

 sectors_to_save = [
-    '1.B', '1.B.1', '1.B.2', '1.C', '1.C.1', '1.C.2', '1.C.3',
-    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4', '2.A.4.b', '2.A.4.d',
-    '2.B', '2.C', '2.C.1', '2.H', '2.H.1', '2.H.2',
-    '3', 'M.AG', '3.A', '3.A.1', '3.A.2',
-    '3.C', '3.C.1', '3.C.2', '3.C.3', '3.C.4',
-    '3.C.5', '3.C.6', '3.C.7', 'M.3.C.1.AG', 'M.3.C.AG', 'M.AG.ELV',
-    'M.LULUCF', 'M.3.C.1.LU', 'M.3.C.LU', '3.B', '3.B.1', '3.B.1.a', '3.B.2', '3.B.2.a',
-    '3.B.2.b', '3.B.6', '3.B.6.b',
-    '4', '4.A', '4.B', '4.C', '4.D',
-    '0', 'M.0.EL', 'M.BK', 'M.BIO']
+    "1",
+    "1.A",
+    "1.A.1",
+    "1.A.2",
+    "1.A.3",
+    "1.A.4",
+    "1.B",
+    "1.B.1",
+    "1.B.2",
+    "1.C",
+    "1.C.1",
+    "1.C.2",
+    "1.C.3",
+    "2",
+    "2.A",
+    "2.A.1",
+    "2.A.2",
+    "2.A.3",
+    "2.A.4",
+    "2.A.4.b",
+    "2.A.4.d",
+    "2.B",
+    "2.C",
+    "2.C.1",
+    "2.H",
+    "2.H.1",
+    "2.H.2",
+    "3",
+    "M.AG",
+    "3.A",
+    "3.A.1",
+    "3.A.2",
+    "3.C",
+    "3.C.1",
+    "3.C.2",
+    "3.C.3",
+    "3.C.4",
+    "3.C.5",
+    "3.C.6",
+    "3.C.7",
+    "M.3.C.1.AG",
+    "M.3.C.AG",
+    "M.AG.ELV",
+    "M.LULUCF",
+    "M.3.C.1.LU",
+    "M.3.C.LU",
+    "3.B",
+    "3.B.1",
+    "3.B.1.a",
+    "3.B.2",
+    "3.B.2.a",
+    "3.B.2.b",
+    "3.B.6",
+    "3.B.6.b",
+    "4",
+    "4.A",
+    "4.B",
+    "4.C",
+    "4.D",
+    "0",
+    "M.0.EL",
+    "M.BK",
+    "M.BIO",
+]


 # gas baskets
 gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
 }
 }
+ 461 - 250
src/unfccc_ghg_data/unfccc_reader/Thailand/config_tha_bur4.py

@@ -1,31 +1,35 @@
-# configuration for Thailand, BUR4
+"""Config for Thailand's BUR5
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
 # ###
 # for reading
 # ###

 # general
 gwp_to_use = "AR4GWP100"
-terminology_proc = 'IPCC2006_PRIMAP'
+terminology_proc = "IPCC2006_PRIMAP"

 # 2019 inventory
 inv_conf = {
-    'year': 2019,
-    'entity_row': 0,
-    'unit_row': 1,
-    'index_cols': "Greenhouse gas source and sink categories",
+    "year": 2019,
+    "entity_row": 0,
+    "unit_row": 1,
+    "index_cols": "Greenhouse gas source and sink categories",
     # special header as category code and name in one column
-    'header_long': ["orig_cat_name", "entity", "unit", "time", "data"],
+    "header_long": ["orig_cat_name", "entity", "unit", "time", "data"],
     # manual category codes (manual mapping to primap1, will be mapped to primap2
     # automatically with the other codes)
-    'cat_codes_manual': {
-        'Total national emissions and removals': '0',
-        'Memo Items (not accounted in total Emissions)': 'MEMO',
-        'International Bunkers': 'MBK',
-        'Aviation International Bunkers': 'MBKA',
-        'Marine-International Bunkers': 'MBKM',
-        'CO2 from biomass': 'MBIO',
+    "cat_codes_manual": {
+        "Total national emissions and removals": "0",
+        "Memo Items (not accounted in total Emissions)": "MEMO",
+        "International Bunkers": "MBK",
+        "Aviation International Bunkers": "MBKA",
+        "Marine-International Bunkers": "MBKM",
+        "CO2 from biomass": "MBIO",
     },
-    'cat_code_regexp': r'^(?P<code>[a-zA-Z0-9]{1,4})[\s\.].*',
+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9]{1,4})[\s\.].*",
 }

 # primap2 format conversion
@@ -52,16 +56,16 @@ coords_value_mapping = {
     "unit": "PRIMAP1",
     "unit": "PRIMAP1",
     "category": "PRIMAP1",
     "category": "PRIMAP1",
     "entity": {
     "entity": {
-        'HFCs': f"HFCS ({gwp_to_use})",
-        'PFCs': f"PFCS ({gwp_to_use})",
-        'SF6': f'SF6 ({gwp_to_use})',
-        'NMVOCs': 'NMVOC',
-        'Nox': 'NOx',
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "SF6": f"SF6 ({gwp_to_use})",
+        "NMVOCs": "NMVOC",
+        "Nox": "NOx",
     },
 }

 filter_remove = {
-    'f_memo': {"category": "MEMO"},
+    "f_memo": {"category": "MEMO"},
 }
 filter_keep = {}

@@ -78,13 +82,13 @@ meta_data = {
 # manual category codes (manual mapping to primap1, will be mapped to primap2
 # automatically with the other codes)
 cat_codes_manual_main_sector_ts = {
-    'Energy': "1",
-    'Industrial Processes and Product Use': "2",
-    'Agriculture': "3",
-    'LULUCF': "4",
-    'Waste': "5",
-    'Net emissions (Include LULUCF)': "0",
-    'Total emissions (Exclude LULUCF)': "M0EL",
+    "Energy": "1",
+    "Industrial Processes and Product Use": "2",
+    "Agriculture": "3",
+    "LULUCF": "4",
+    "Waste": "5",
+    "Net emissions (Include LULUCF)": "0",
+    "Total emissions (Exclude LULUCF)": "M0EL",
 }
 }

 coords_cols_main_sector_ts = {
 # ###
 # ###
 # aggregate categories
 country_processing_step1 = {
-        '2.A.4': {'sources': ['2.A.4.b', '2.A.4.d'],
-                  'name': 'Other Process uses of Carbonates'},
-        '2.B.8': {'sources': ['2.B.8.b', '2.B.8.c', '2.B.8.e', '2.B.8.f'],
-                  'name': 'Petrochemical and Carbon Black production'},
+    "aggregate_cats": {
+        "2.A.4": {
+            "sources": ["2.A.4.b", "2.A.4.d"],
+            "name": "Other Process uses of Carbonates",
+        },
+        "2.B.8": {
+            "sources": ["2.B.8.b", "2.B.8.c", "2.B.8.e", "2.B.8.f"],
+            "name": "Petrochemical and Carbon Black production",
+        },
     },
     },
-        'KYOTOGHG': {
-            'basket': 'KYOTOGHG (AR4GWP100)',
-            'basket_contents': ['CO2', 'CH4', 'N2O', 'SF6',
-                                'HFCS (AR4GWP100)', 'PFCS (AR4GWP100)'],
-            'skipna': True,
-            'min_count': 1,
-            'sel': {f'category ({coords_terminologies["category"]})':
-                [
-                    '0', '1', '1.A', '1.A.1', '1.A.2', '1.A.3',
-                    '1.A.4', '1.A.5', '1.B', '1.B.1', '1.B.2', '1.B.3',
-                    '1.C',
-                    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4',
-                    '2.B', '2.C', '2.D', '2.F', '2.G', '2.H',
-                    '3', '3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                    '3.H', '3.I',
-                    '4', '4.A', '4.B', '4.C', '4.D',
-                    '4.E', '4.E.1', '4.E.2', '4.E.3',
-                    '5', '5.A', '5.B', '5.C', '5.D'
+    "aggregate_gases": {
+        "KYOTOGHG": {
+            "basket": "KYOTOGHG (AR4GWP100)",
+            "basket_contents": [
+                "CO2",
+                "CH4",
+                "N2O",
+                "SF6",
+                "HFCS (AR4GWP100)",
+                "PFCS (AR4GWP100)",
+            ],
+            "skipna": True,
+            "min_count": 1,
+            "sel": {
+                f'category ({coords_terminologies["category"]})': [
+                    "0",
+                    "1",
+                    "1.A",
+                    "1.A.1",
+                    "1.A.2",
+                    "1.A.3",
+                    "1.A.4",
+                    "1.A.5",
+                    "1.B",
+                    "1.B.1",
+                    "1.B.2",
+                    "1.B.3",
+                    "1.C",
+                    "2",
+                    "2.A",
+                    "2.A.1",
+                    "2.A.2",
+                    "2.A.3",
+                    "2.A.4",
+                    "2.B",
+                    "2.C",
+                    "2.D",
+                    "2.F",
+                    "2.G",
+                    "2.H",
+                    "3",
+                    "3.A",
+                    "3.B",
+                    "3.C",
+                    "3.D",
+                    "3.E",
+                    "3.F",
+                    "3.G",
+                    "3.H",
+                    "3.I",
+                    "4",
+                    "4.A",
+                    "4.B",
+                    "4.C",
+                    "4.D",
+                    "4.E",
+                    "4.E.1",
+                    "4.E.2",
+                    "4.E.3",
+                    "5",
+                    "5.A",
+                    "5.B",
+                    "5.C",
+                    "5.D",
                 ]
-            }, # not tested
+            },  # not tested
         },
     },
 }

 country_processing_step2 = {
-    'downscale': {
+    "downscale": {
         # main sectors present as KYOTOGHG sum. subsectors need to be downscaled
         # TODO: downscale CO, NOx, NMVOC, SO2 (national total present)
-        'sectors': {
-            '1': {
-                'basket': '1',
-                'basket_contents': ['1.A', '1.B', '1.C'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+        "sectors": {
+            "1": {
+                "basket": "1",
+                "basket_contents": ["1.A", "1.B", "1.C"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '1.A': {
-                'basket': '1.A',
-                'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4', '1.A.5'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "1.A": {
+                "basket": "1.A",
+                "basket_contents": ["1.A.1", "1.A.2", "1.A.3", "1.A.4", "1.A.5"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '1.B': {
-                'basket': '1.B',
-                'basket_contents': ['1.B.1', '1.B.2', '1.B.3'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "1.B": {
+                "basket": "1.B",
+                "basket_contents": ["1.B.1", "1.B.2", "1.B.3"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '2': {
-                'basket': '2',
-                'basket_contents': ['2.A', '2.B', '2.C', '2.D', '2.F', '2.G', '2.H'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "2": {
+                "basket": "2",
+                "basket_contents": ["2.A", "2.B", "2.C", "2.D", "2.F", "2.G", "2.H"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '2.A': {
-                'basket': '2.A',
-                'basket_contents': ['2.A.1', '2.A.2', '2.A.3', '2.A.4'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "2.A": {
+                "basket": "2.A",
+                "basket_contents": ["2.A.1", "2.A.2", "2.A.3", "2.A.4"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '3': {
-                'basket': '3',
-                'basket_contents': ['3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                                    '3.H', '3.I'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "3": {
+                "basket": "3",
+                "basket_contents": [
+                    "3.A",
+                    "3.B",
+                    "3.C",
+                    "3.D",
+                    "3.E",
+                    "3.F",
+                    "3.G",
+                    "3.H",
+                    "3.I",
+                ],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '4': {
-                'basket': '4',
-                'basket_contents': ['4.A', '4.B', '4.C', '4.D', '4.E'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "4": {
+                "basket": "4",
+                "basket_contents": ["4.A", "4.B", "4.C", "4.D", "4.E"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '4.E': {
-                'basket': '4.E',
-                'basket_contents': ['4.E.1', '4.E.2', '4.E.3'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "4.E": {
+                "basket": "4.E",
+                "basket_contents": ["4.E.1", "4.E.2", "4.E.3"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '5': {
-                'basket': '5',
-                'basket_contents': ['5.A', '5.B', '5.C', '5.D'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "5": {
+                "basket": "5",
+                "basket_contents": ["5.A", "5.B", "5.C", "5.D"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
         },
-        'entities': {
-            'KYOTO': {
-                'basket': 'KYOTOGHG (AR4GWP100)',
-                'basket_contents': ['CH4', 'CO2', 'N2O', 'HFCS (AR4GWP100)',
-                                    'PFCS (AR4GWP100)', 'SF6'],
-                'sel': {f'category ({coords_terminologies["category"]})':
-                    [
-                        '1', '1.A', '1.A.1', '1.A.2', '1.A.3',
-                        '1.A.4', '1.A.5', '1.B', '1.B.1', '1.B.2', '1.B.3',
-                        '1.C',
-                        '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4',
-                        '2.B', '2.C', '2.D', '2.F', '2.G', '2.H',
-                        '3', '3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                        '3.H', '3.I',
-                        '4', '4.A', '4.B', '4.C', '4.D',
-                        '4.E', '4.E.1', '4.E.2', '4.E.3',
-                        '5', '5.A', '5.B', '5.C', '5.D']},
+        "entities": {
+            "KYOTO": {
+                "basket": "KYOTOGHG (AR4GWP100)",
+                "basket_contents": [
+                    "CH4",
+                    "CO2",
+                    "N2O",
+                    "HFCS (AR4GWP100)",
+                    "PFCS (AR4GWP100)",
+                    "SF6",
+                ],
+                "sel": {
+                    f'category ({coords_terminologies["category"]})': [
+                        "1",
+                        "1.A",
+                        "1.A.1",
+                        "1.A.2",
+                        "1.A.3",
+                        "1.A.4",
+                        "1.A.5",
+                        "1.B",
+                        "1.B.1",
+                        "1.B.2",
+                        "1.B.3",
+                        "1.C",
+                        "2",
+                        "2.A",
+                        "2.A.1",
+                        "2.A.2",
+                        "2.A.3",
+                        "2.A.4",
+                        "2.B",
+                        "2.C",
+                        "2.D",
+                        "2.F",
+                        "2.G",
+                        "2.H",
+                        "3",
+                        "3.A",
+                        "3.B",
+                        "3.C",
+                        "3.D",
+                        "3.E",
+                        "3.F",
+                        "3.G",
+                        "3.H",
+                        "3.I",
+                        "4",
+                        "4.A",
+                        "4.B",
+                        "4.C",
+                        "4.D",
+                        "4.E",
+                        "4.E.1",
+                        "4.E.2",
+                        "4.E.3",
+                        "5",
+                        "5.A",
+                        "5.B",
+                        "5.C",
+                        "5.D",
+                    ]
+                },
             },
         },
     },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
     },
 }

 cat_conversion = {
-    'mapping': {
-        '0': '0',
-        'M.0.EL': 'M.0.EL',
-        '1': '1',
-        '1.A': '1.A',
-        '1.A.1': '1.A.1',
-        '1.A.1.a': '1.A.1.a',
-        '1.A.1.b': '1.A.1.b',
-        '1.A.2': '1.A.2',
-        '1.A.3': '1.A.3',
-        '1.A.3.a': '1.A.3.a',
-        '1.A.3.b': '1.A.3.b',
-        '1.A.3.c': '1.A.3.c',
-        '1.A.3.d': '1.A.3.d',
-        '1.A.4': '1.A.4',
-        '1.A.5': '1.A.5',
-        '1.B': '1.B',
-        '1.B.1': '1.B.1',
-        '1.B.2': '1.B.2',
-        '1.B.3': '1.B.3',
-        '1.C': '1.C',
-        '1.C.1': '1.C.1',
-        '1.C.2': '1.C.2',
-        '1.C.3': '1.C.3',
-        '2': '2',
-        '2.A': '2.A',
-        '2.A.1': '2.A.1',
-        '2.A.2': '2.A.2',
-        '2.A.3': '2.A.3',
-        '2.A.4': '2.A.4',
-        '2.A.4.b': '2.A.4.b',
-        '2.A.4.d': '2.A.4.d',
-        '2.B': '2.B',
-        '2.B.2': '2.B.2',
-        '2.B.4': '2.B.4',
-        '2.B.8': '2.B.8',
-        '2.B.8.b': '2.B.8.b',
-        '2.B.8.c': '2.B.8.c',
-        '2.B.8.e': '2.B.8.e',
-        '2.B.8.f': '2.B.8.f',
-        '2.C': '2.C',
-        '2.C.1': '2.C.1',
-        '2.D': '2.D',
-        '2.D.1': '2.D.1',
-        '2.F': '2.F',
-        '2.F.1': '2.F.1',
-        '2.G': '2.G',
-        '2.G.1': '2.G.1',
-        '2.H': '2.H',
-        '2.H.1': '2.H.1',
-        '2.H.2': '2.H.2',
-        '3': 'M.AG',
-        '3.A': '3.A.1',
-        '3.B': '3.A.2',
-        '3.C': 'M.3.C.1.b.i',  # field burning of agricultural residues
-        '3.D': '3.C.2',  # Liming
-        '3.E': '3.C.3',  # urea application
-        '3.F': '3.C.4',  # direct N2O from agri soils
-        '3.G': '3.C.5',  # indirect N2O from agri soils
-        '3.H': '3.C.6',  # indirect N2O from manure management
-        '3.I': '3.C.7',  # rice
+    "mapping": {
+        "0": "0",
+        "M.0.EL": "M.0.EL",
+        "1": "1",
+        "1.A": "1.A",
+        "1.A.1": "1.A.1",
+        "1.A.1.a": "1.A.1.a",
+        "1.A.1.b": "1.A.1.b",
+        "1.A.2": "1.A.2",
+        "1.A.3": "1.A.3",
+        "1.A.3.a": "1.A.3.a",
+        "1.A.3.b": "1.A.3.b",
+        "1.A.3.c": "1.A.3.c",
+        "1.A.3.d": "1.A.3.d",
+        "1.A.4": "1.A.4",
+        "1.A.5": "1.A.5",
+        "1.B": "1.B",
+        "1.B.1": "1.B.1",
+        "1.B.2": "1.B.2",
+        "1.B.3": "1.B.3",
+        "1.C": "1.C",
+        "1.C.1": "1.C.1",
+        "1.C.2": "1.C.2",
+        "1.C.3": "1.C.3",
+        "2": "2",
+        "2.A": "2.A",
+        "2.A.1": "2.A.1",
+        "2.A.2": "2.A.2",
+        "2.A.3": "2.A.3",
+        "2.A.4": "2.A.4",
+        "2.A.4.b": "2.A.4.b",
+        "2.A.4.d": "2.A.4.d",
+        "2.B": "2.B",
+        "2.B.2": "2.B.2",
+        "2.B.4": "2.B.4",
+        "2.B.8": "2.B.8",
+        "2.B.8.b": "2.B.8.b",
+        "2.B.8.c": "2.B.8.c",
+        "2.B.8.e": "2.B.8.e",
+        "2.B.8.f": "2.B.8.f",
+        "2.C": "2.C",
+        "2.C.1": "2.C.1",
+        "2.D": "2.D",
+        "2.D.1": "2.D.1",
+        "2.F": "2.F",
+        "2.F.1": "2.F.1",
+        "2.G": "2.G",
+        "2.G.1": "2.G.1",
+        "2.H": "2.H",
+        "2.H.1": "2.H.1",
+        "2.H.2": "2.H.2",
+        "3": "M.AG",
+        "3.A": "3.A.1",
+        "3.B": "3.A.2",
+        "3.C": "M.3.C.1.b.i",  # field burning of agricultural residues
+        "3.D": "3.C.2",  # Liming
+        "3.E": "3.C.3",  # urea application
+        "3.F": "3.C.4",  # direct N2O from agri soils
+        "3.G": "3.C.5",  # indirect N2O from agri soils
+        "3.H": "3.C.6",  # indirect N2O from manure management
+        "3.I": "3.C.7",  # rice
         #'4': 'M.LULUCF',
-        '4.A': '3.B.1.a',  # forest remaining forest
-        '4.B': '3.B.2.a',  # cropland remaining cropland
-        '4.C': '3.B.2.b',  # land converted to cropland
-        '4.D': '3.B.6.b',  # land converted to other land
+        "4.A": "3.B.1.a",  # forest remaining forest
+        "4.B": "3.B.2.a",  # cropland remaining cropland
+        "4.C": "3.B.2.b",  # land converted to cropland
+        "4.D": "3.B.6.b",  # land converted to other land
         #'4.E': 'M.3.C.1.LU',  # biomass burning (LULUCF)
-        '4.E.1': '3.C.1.a', # biomass burning (Forest Land)
-        '4.E.2': 'M.3.C.1.b.ii', # biomass burning (Cropland)
-        '4.E.3': '3.C.1.d', # biomass burning (Other Land)
-        '5': '4',
-        '5.A': '4.A',
-        '5.A.1': '4.A.1',
-        '5.A.2': '4.A.2',
-        '5.B': '4.B',
-        '5.C': '4.C',
-        '5.C.1': '4.C.1',
-        '5.D': '4.D',
-        '5.D.1': '4.D.1',
-        '5.D.2': '4.D.2',
-        'M.BK': 'M.BK',
-        'M.BK.A': 'M.BK.A',
-        'M.BK.M': 'M.BM.M',
-        'M.BIO': 'M.BIO',
+        "4.E.1": "3.C.1.a",  # biomass burning (Forest Land)
+        "4.E.2": "M.3.C.1.b.ii",  # biomass burning (Cropland)
+        "4.E.3": "3.C.1.d",  # biomass burning (Other Land)
+        "5": "4",
+        "5.A": "4.A",
+        "5.A.1": "4.A.1",
+        "5.A.2": "4.A.2",
+        "5.B": "4.B",
+        "5.C": "4.C",
+        "5.C.1": "4.C.1",
+        "5.D": "4.D",
+        "5.D.1": "4.D.1",
+        "5.D.2": "4.D.2",
+        "M.BK": "M.BK",
+        "M.BK.A": "M.BK.A",
+        "M.BK.M": "M.BM.M",
+        "M.BIO": "M.BIO",
     },
-    'aggregate': {
-        '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-        '3.C.1.b': {'sources': ['M.3.C.1.b.i', 'M.3.C.1.b.ii'],
-                  'name': 'Biomass Burning In Cropland'},
-        'M.3.C.1.AG': {'sources': ['3.C.1.b', '3.C.1.c'],
-                  'name': 'Biomass Burning (Agriculture)'},
-        'M.3.C.1.LU': {'sources': ['3.C.1.a', '3.C.1.d'],
-                  'name': 'Biomass Burning (LULUCF)'},
-        '3.C.1': {'sources': ['M.3.C.1.AG', 'M.3.C.1.LU'],
-                  'name': 'Emissions from Biomass Burning'},
-        '3.C': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-                'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-        'M.3.C.AG': {
-            'sources': ['M.3.C.1.AG', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG'],
-                     'name': 'Agriculture excluding livestock emissions'},
-        'M.3.C.LU': {'sources': ['M.3.C.1.LU'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land (Land use)'},
-        '3.B.1': {'sources': ['3.B.1.a'], 'name': 'Forest Land'},
-        '3.B.2': {'sources': ['3.B.2.a', '3.B.2.b'], 'name': 'Cropland'},
-        '3.B.6': {'sources': ['3.B.6.b'], 'name': 'Other Land'},
-        '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.6'], 'name': 'Land'},
-        'M.LULUCF': {'sources': ['3.B', 'N.3.C.LU'], 'name': 'LULUCF'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    "aggregate": {
+        "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+        "3.C.1.b": {
+            "sources": ["M.3.C.1.b.i", "M.3.C.1.b.ii"],
+            "name": "Biomass Burning In Cropland",
+        },
+        "M.3.C.1.AG": {
+            "sources": ["3.C.1.b", "3.C.1.c"],
+            "name": "Biomass Burning (Agriculture)",
+        },
+        "M.3.C.1.LU": {
+            "sources": ["3.C.1.a", "3.C.1.d"],
+            "name": "Biomass Burning (LULUCF)",
+        },
+        "3.C.1": {
+            "sources": ["M.3.C.1.AG", "M.3.C.1.LU"],
+            "name": "Emissions from Biomass Burning",
+        },
+        "3.C": {
+            "sources": ["3.C.1", "3.C.2", "3.C.3", "3.C.4", "3.C.5", "3.C.6", "3.C.7"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land",
+        },
+        "M.3.C.AG": {
+            "sources": [
+                "M.3.C.1.AG",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+            ],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+        },
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG"],
+            "name": "Agriculture excluding livestock emissions",
+        },
+        "M.3.C.LU": {
+            "sources": ["M.3.C.1.LU"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Land use)",
+        },
+        "3.B.1": {"sources": ["3.B.1.a"], "name": "Forest Land"},
+        "3.B.2": {"sources": ["3.B.2.a", "3.B.2.b"], "name": "Cropland"},
+        "3.B.6": {"sources": ["3.B.6.b"], "name": "Other Land"},
+        "3.B": {"sources": ["3.B.1", "3.B.2", "3.B.6"], "name": "Land"},
+        "M.LULUCF": {"sources": ["3.B", "N.3.C.LU"], "name": "LULUCF"},
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
     },
 }

 sectors_to_save = [
-    '1', '1.A', '1.A.1', '1.A.1.a', '1.A.1.b', '1.A.2', '1.A.3', '1.A.3.a', '1.A.3.b',
-    '1.A.3.c', '1.A.3.d', '1.A.4', '1.A.5',
-    '1.B', '1.B.1', '1.B.2', '1.B.3', '1.C', '1.C.1', '1.C.2', '1.C.3',
-    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4', '2.A.4.b', '2.A.4.d',
-    '2.B', '2.B.2', '2.B.4', '2.B.8', '2.B.8.a', '2.B.8.c', '2.B.8.e', '2.B.8.f',
-    '2.C', '2.C.1', '2.F', '2.F.1', '2.G', '2.G.1', '2.H', '2.H.1', '2.H.2',
-    '3', 'M.AG', '3.A', '3.A.1', '3.A.2',
-    '3.C', '3.C.1', '3.C.1.a', '3.C.1.b', '3.C.1.d', '3.C.2', '3.C.3', '3.C.4',
-    '3.C.5', '3.C.6', '3.C.7', 'M.3.C.1.AG', 'M.3.C.AG', 'M.AG.ELV',
-    'M.LULUCF', 'M.3.C.1.LU', 'M.3.C.LU', '3.B', '3.B.1', '3.B.1.a', '3.B.2', '3.B.2.a',
-    '3.B.2.b', '3.B.6', '3.B.6.b',
-    '4', '4.A', '4.A.1', '4.A.2', '4.B', '4.C', '4.C.1', '4.D', '4.D.1', '4.D.2',
-    '0', 'M.0.EL', 'M.BK', 'M.BK.A', 'M.BK.M', 'M.BIO']
+    "1",
+    "1.A",
+    "1.A.1",
+    "1.A.1.a",
+    "1.A.1.b",
+    "1.A.2",
+    "1.A.3",
+    "1.A.3.a",
+    "1.A.3.b",
+    "1.A.3.c",
+    "1.A.3.d",
+    "1.A.4",
+    "1.A.5",
+    "1.B",
+    "1.B.1",
+    "1.B.2",
+    "1.B.3",
+    "1.C",
+    "1.C.1",
+    "1.C.2",
+    "1.C.3",
+    "2",
+    "2.A",
+    "2.A.1",
+    "2.A.2",
+    "2.A.3",
+    "2.A.4",
+    "2.A.4.b",
+    "2.A.4.d",
+    "2.B",
+    "2.B.2",
+    "2.B.4",
+    "2.B.8",
+    "2.B.8.a",
+    "2.B.8.c",
+    "2.B.8.e",
+    "2.B.8.f",
+    "2.C",
+    "2.C.1",
+    "2.F",
+    "2.F.1",
+    "2.G",
+    "2.G.1",
+    "2.H",
+    "2.H.1",
+    "2.H.2",
+    "3",
+    "M.AG",
+    "3.A",
+    "3.A.1",
+    "3.A.2",
+    "3.C",
+    "3.C.1",
+    "3.C.1.a",
+    "3.C.1.b",
+    "3.C.1.d",
+    "3.C.2",
+    "3.C.3",
+    "3.C.4",
+    "3.C.5",
+    "3.C.6",
+    "3.C.7",
+    "M.3.C.1.AG",
+    "M.3.C.AG",
+    "M.AG.ELV",
+    "M.LULUCF",
+    "M.3.C.1.LU",
+    "M.3.C.LU",
+    "3.B",
+    "3.B.1",
+    "3.B.1.a",
+    "3.B.2",
+    "3.B.2.a",
+    "3.B.2.b",
+    "3.B.6",
+    "3.B.6.b",
+    "4",
+    "4.A",
+    "4.A.1",
+    "4.A.2",
+    "4.B",
+    "4.C",
+    "4.C.1",
+    "4.D",
+    "4.D.1",
+    "4.D.2",
+    "0",
+    "M.0.EL",
+    "M.BK",
+    "M.BK.A",
+    "M.BK.M",
+    "M.BIO",
+]


 # gas baskets
 gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
 }
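Read as a recipe, each gas_baskets entry defines a composite entity as the CO2-equivalent sum of its members, and nested baskets (KYOTOGHG contains FGASES) have to be resolved inside-out. A minimal pandas sketch of that arithmetic — the emissions frame and the sum_basket helper below are hypothetical, and the pipeline itself performs this aggregation through process_data_for_country rather than by hand:

import pandas as pd

# hypothetical per-entity CO2eq time series (columns = entities, index = years)
emissions = pd.DataFrame(
    {
        "CO2": [100.0, 110.0],
        "CH4": [20.0, 21.0],
        "N2O": [5.0, 5.5],
        "HFCS (AR4GWP100)": [1.0, 1.2],
        "PFCS (AR4GWP100)": [0.3, 0.3],
        "SF6": [0.1, 0.1],
        "NF3": [0.0, 0.0],
    },
    index=[2018, 2019],
)

def sum_basket(df, basket, contents):
    # sum the member columns; min_count=1 keeps NaN when all members are missing
    df[basket] = df[contents].sum(axis=1, min_count=1)
    return df

# resolve nested baskets inside-out: FGASES first, then KYOTOGHG
emissions = sum_basket(
    emissions, "FGASES (AR4GWP100)", ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"]
)
emissions = sum_basket(
    emissions, "KYOTOGHG (AR4GWP100)", ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"]
)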

+ 129 - 89
src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR3_from_pdf.py

@@ -1,10 +1,14 @@
-# this script reads data from Thailand's BUR3
-# Data is read from the pdf file
+"""
+Read Thailand's BUR3 from pdf

+This script reads data from Thailand's BUR3.
+Data are read from the pdf using camelot.
+
+"""
 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_tha_bur3 import (
+from config_tha_bur3 import (
     cat_conversion,
     coords_cols,
     coords_cols_indirect,
@@ -26,53 +30,65 @@ from .config_tha_bur3 import (
     trend_conf,
 )

-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path, process_data_for_country
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)

 if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Thailand' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Thailand'
+    input_folder = downloaded_data_path / "UNFCCC" / "Thailand" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Thailand"
     if not output_folder.exists():
         output_folder.mkdir()

-    inventory_file = 'BUR3_Thailand_251220_.pdf'
-    output_filename = 'THA_BUR3_2020_'
+    inventory_file = "BUR3_Thailand_251220_.pdf"
+    output_filename = "THA_BUR3_2020_"

     compression = dict(zlib=True, complevel=9)

     # inventory tables
-    pages_inventory = '68,69'
+    pages_inventory = "68,69"

     # main sector time series
-    page_main_sector_ts = '70'
+    page_main_sector_ts = "70"

     # indirect gases time series
-    page_indirect = '72'
-
+    page_indirect = "72"

     # ###
     # read the inventory data and convert to PM2 IF
     # ###
-    tables_inventory = camelot.read_pdf(str(input_folder / inventory_file), pages=pages_inventory,
-                                        split_text=True, flavor="lattice")
+    tables_inventory = camelot.read_pdf(
+        str(input_folder / inventory_file),
+        pages=pages_inventory,
+        split_text=True,
+        flavor="lattice",
+    )

     df_inventory = tables_inventory[0].df[1:]
     df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])

-    df_inventory = pd.concat([df_header, df_inventory, tables_inventory[1].df.iloc[1:]],
-                             axis=0, join='outer')
+    df_inventory = pd.concat(
+        [df_header, df_inventory, tables_inventory[1].df.iloc[1:]], axis=0, join="outer"
+    )

-    df_inventory = pm2.pm2io.nir_add_unit_information(df_inventory,
-                                                      unit_row=inv_conf["unit_row"],
-                                                      entity_row=inv_conf["entity_row"],
-                                                      regexp_entity=".*", regexp_unit=".*",
-                                                      default_unit="Gg")
+    df_inventory = pm2.pm2io.nir_add_unit_information(
+        df_inventory,
+        unit_row=inv_conf["unit_row"],
+        entity_row=inv_conf["entity_row"],
+        regexp_entity=".*",
+        regexp_unit=".*",
+        default_unit="Gg",
+    )
     # set index and convert to long format
     df_inventory = df_inventory.set_index(inv_conf["index_cols"])
-    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(df_inventory, inv_conf["year"],
-                                                         inv_conf["header_long"])
+    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(
+        df_inventory, inv_conf["year"], inv_conf["header_long"]
+    )
     df_inventory_long["orig_cat_name"] = df_inventory_long["orig_cat_name"].str[0]

     # prep for conversion to PM2 IF and native format
@@ -81,24 +97,29 @@ if __name__ == "__main__":

     # replace cat names by codes in col "category"
     # first the manual replacements
-    df_inventory_long["category"] = \
-        df_inventory_long["category"].replace(inv_conf["cat_codes_manual"])
+    df_inventory_long["category"] = df_inventory_long["category"].replace(
+        inv_conf["cat_codes_manual"]
+    )
+
     # then the regex replacements
-    def repl(m):
-        return m.group('code')
-    df_inventory_long["category"] = \
-        df_inventory_long["category"].str.replace(inv_conf["cat_code_regexp"], repl,
-                                                  regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_inventory_long["category"] = df_inventory_long["category"].str.replace(
+        inv_conf["cat_code_regexp"], repl, regex=True
+    )
     df_inventory_long = df_inventory_long.reset_index(drop=True)

     # replace "," with "" in data
-    def repl(m):
-        return m.group('part1') + m.group('part2')
-    df_inventory_long.loc[:, "data"] = \
-        df_inventory_long.loc[:, "data"].str.replace(
-            '(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-    df_inventory_long.loc[:, "data"] = df_inventory_long.loc[:, "data"].str.\
-        replace(' ','', regex=False)
+    def repl(m):  # noqa: D103
+        return m.group("part1") + m.group("part2")
+
+    df_inventory_long.loc[:, "data"] = df_inventory_long.loc[:, "data"].str.replace(
+        "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+    )
+    df_inventory_long.loc[:, "data"] = df_inventory_long.loc[:, "data"].str.replace(
+        " ", "", regex=False
+    )

     # make sure all col headers are str
     df_inventory_long.columns = df_inventory_long.columns.map(str)
@@ -108,27 +129,31 @@ if __name__ == "__main__":
     data_inventory_IF = pm2.pm2io.convert_long_dataframe_if(
         df_inventory_long,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )

     # ###
     # read the main sector time series and convert to PM2 IF
     # ###
-    tables_main_sector_ts = camelot.read_pdf(str(input_folder / inventory_file), pages=page_main_sector_ts,
-                                        split_text=True, flavor="lattice")
+    tables_main_sector_ts = camelot.read_pdf(
+        str(input_folder / inventory_file),
+        pages=page_main_sector_ts,
+        split_text=True,
+        flavor="lattice",
+    )

     df_main_sector_ts = tables_main_sector_ts[0].df.iloc[2:]
-    #df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
-    #df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
+    # df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
+    # df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
     df_main_sector_ts.columns = [trend_conf["header"], trend_conf["unit"]]

     df_main_sector_ts = df_main_sector_ts.transpose()
@@ -141,42 +166,49 @@ if __name__ == "__main__":

     # replace cat names by codes in col "category"
     df_main_sector_ts["category"] = df_main_sector_ts["category"].replace(
-        trend_conf["cat_codes_manual"])
+        trend_conf["cat_codes_manual"]
+    )
 
-    def repl(m):
-        return m.group('part1') + m.group('part2')
-    year_cols = list(set(df_main_sector_ts.columns) - set(['category', 'unit']))
+    def repl(m):  # noqa: D103
+        return m.group("part1") + m.group("part2")
+
+    year_cols = list(set(df_main_sector_ts.columns) - set(["category", "unit"]))
     for col in year_cols:
     for col in year_cols:
-        df_main_sector_ts.loc[:, col] = df_main_sector_ts.loc[:, col].str.\
-            replace('(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-        df_main_sector_ts.loc[:, col] = df_main_sector_ts.loc[:, col].str.\
-            replace(' ','', regex=False)
+        df_main_sector_ts.loc[:, col] = df_main_sector_ts.loc[:, col].str.replace(
+            "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+        )
+        df_main_sector_ts.loc[:, col] = df_main_sector_ts.loc[:, col].str.replace(
+            " ", "", regex=False
+        )

     data_main_sector_ts_IF = pm2.pm2io.convert_wide_dataframe_if(
         df_main_sector_ts,
         coords_cols=coords_cols_main_sector_ts,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults_main_sector_ts,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
-
+    )

     # ###
     # read the indirect gases time series and convert to PM2 IF
     # ###
-    tables_indirect = camelot.read_pdf(str(input_folder / inventory_file), pages=page_indirect,
-                                        split_text=True, flavor="lattice")
+    tables_indirect = camelot.read_pdf(
+        str(input_folder / inventory_file),
+        pages=page_indirect,
+        split_text=True,
+        flavor="lattice",
+    )

     df_indirect = tables_indirect[0].df.iloc[2:]
-    #df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
-    #df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
+    # df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
+    # df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
     df_indirect.columns = [ind_conf["header"], ind_conf["unit"]]

     df_indirect = df_indirect.transpose()
@@ -188,29 +220,32 @@ if __name__ == "__main__":
     df_indirect = df_indirect.drop(0)
     df_indirect = df_indirect.drop(columns=ind_conf["cols_to_remove"])
 
-    def repl(m):
-        return m.group('part1') + m.group('part2')
-    year_cols = list(set(df_indirect.columns) - set(['entity', 'unit']))
+    def repl(m):  # noqa: D103
+        return m.group("part1") + m.group("part2")
+
+    year_cols = list(set(df_indirect.columns) - set(["entity", "unit"]))
     for col in year_cols:
     for col in year_cols:
-        df_indirect.loc[:, col] = df_indirect.loc[:, col].str.\
-            replace('(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-        df_indirect.loc[:, col] = df_indirect.loc[:, col].str.\
-            replace(' ','', regex=False)
+        df_indirect.loc[:, col] = df_indirect.loc[:, col].str.replace(
+            "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+        )
+        df_indirect.loc[:, col] = df_indirect.loc[:, col].str.replace(
+            " ", "", regex=False
+        )

     data_indirect_IF = pm2.pm2io.convert_wide_dataframe_if(
         df_indirect,
         coords_cols=coords_cols_indirect,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults_indirect,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )

     # ###
     # merge the three datasets
@@ -231,12 +266,15 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_all_if)
+        data_all_if,
+    )

     encoding = {var: compression for var in data_all_pm2.data_vars}
     data_all_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )

     # ###
     # ## process the data
@@ -244,14 +282,15 @@ if __name__ == "__main__":
     data_proc_pm2 = data_all_pm2

     # combine CO2 emissions and removals
-    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
-    data_proc_pm2["CO2"].attrs['entity'] = 'CO2'
+    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum(
+        dim="entity", skipna=True, min_count=1
+    )
+    data_proc_pm2["CO2"].attrs["entity"] = "CO2"
 
 
     # actual processing
     # actual processing
     data_proc_pm2 = process_data_for_country(
     data_proc_pm2 = process_data_for_country(
         data_proc_pm2,
         data_proc_pm2,
-        entities_to_ignore=['CO2 emissions', 'CO2 removals'],
+        entities_to_ignore=["CO2 emissions", "CO2 removals"],
         gas_baskets={},
         processing_info_country=country_processing_step1,
     )
@@ -261,16 +300,16 @@ if __name__ == "__main__":
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
         processing_info_country=country_processing_step2,
-        cat_terminology_out = terminology_proc,
-        category_conversion = cat_conversion,
-        sectors_out = sectors_to_save,
+        cat_terminology_out=terminology_proc,
+        category_conversion=cat_conversion,
+        sectors_out=sectors_to_save,
     )

     # adapt source and metadata
     # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)

     # ###
     # save data to IF and native format
@@ -279,9 +318,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )

     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 90 - 64
src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR4_from_pdf.py

@@ -1,18 +1,27 @@
-# this script reads data from Thailand's BUR3
-# Data is read from two csv files which have been created manually from ocr processed
-# pdf files
-# pdftk Thailand_BUR4_final_28122022.pdf cat 65-67east output inventory_2019.pdf
-# ocrmypdf --force-ocr inventory_2019.pdf inventory_2019_ocr.pdf
-# pdftk Thailand_BUR4_final_28122022.pdf cat 69 output trends.pdf
-# ocrmypdf --force-ocr trends.pdf trends_ocr.pdf
-
-# values for HFCs and SF6 have been taken from Table2-9 where they are present in
-# CO2eq and thus HFC data can be used and SF6 data is not 0 as in the mein inventory
-# tables
+"""
+Read Thailand's BUR4 from pdf
+
+This script reads data from Thailand's BUR4.
+Data are read from two csv files which have been created manually from OCR-processed
+pdf files:
+
+.. code-block:: bash
+
+    pdftk Thailand_BUR4_final_28122022.pdf cat 65-67east output inventory_2019.pdf
+    ocrmypdf --force-ocr inventory_2019.pdf inventory_2019_ocr.pdf
+    pdftk Thailand_BUR4_final_28122022.pdf cat 69 output trends.pdf
+    ocrmypdf --force-ocr trends.pdf trends_ocr.pdf
+
+Values for HFCs and SF6 have been taken from Table 2-9 where they are present in
+CO2eq and thus HFC data can be used and SF6 data is not 0 as in the main inventory
+tables.
+
+"""
+
 
 
 import pandas as pd
 import primap2 as pm2
+from config_tha_bur4 import (
     cat_codes_manual_main_sector_ts,
     cat_codes_manual_main_sector_ts,
     cat_conversion,
     coords_cols,
     terminology_proc,
     terminology_proc,
 )

-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path, process_data_for_country
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)

 if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    output_folder = extracted_data_path / 'UNFCCC' / 'Thailand'
+    input_folder = downloaded_data_path / "UNFCCC" / "Thailand" / "BUR4"
+    output_folder = extracted_data_path / "UNFCCC" / "Thailand"
     if not output_folder.exists():
     if not output_folder.exists():
         output_folder.mkdir()

-    inventory_file = 'THA_inventory_2019.csv'
-    trends_file = 'THA_trends_2000-2019.csv'
-    indirect_file = 'THA_indirect_2000-2019.csv'
-    output_filename = 'THA_BUR4_2022_'
+    inventory_file = "THA_inventory_2019.csv"
+    trends_file = "THA_trends_2000-2019.csv"
+    indirect_file = "THA_indirect_2000-2019.csv"
+    output_filename = "THA_BUR4_2022_"

     compression = dict(zlib=True, complevel=9)

-
     # ###
     # read the inventory data and convert to PM2 IF
     # ###
+    df_inventory = pd.read_csv(input_folder / inventory_file, header=None)
     df_inventory = pm2.pm2io.nir_add_unit_information(
     df_inventory = pm2.pm2io.nir_add_unit_information(
-        regexp_entity=".*", regexp_unit=".*", default_unit="Gg")
+        df_inventory,
+        unit_row=inv_conf["unit_row"],
+        entity_row=inv_conf["entity_row"],
+        regexp_entity=".*",
+        regexp_unit=".*",
+        default_unit="Gg",
+    )
     # set index and convert to long format
     df_inventory = df_inventory.set_index(inv_conf["index_cols"])
-    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(df_inventory, inv_conf["year"],
-                                                         inv_conf["header_long"])
+    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(
+        df_inventory, inv_conf["year"], inv_conf["header_long"]
+    )
     df_inventory_long["orig_cat_name"] = df_inventory_long["orig_cat_name"].str[0]
     df_inventory_long["orig_cat_name"] = df_inventory_long["orig_cat_name"].str[0]
 
 
     # prep for conversion to PM2 IF and native format
     # prep for conversion to PM2 IF and native format
@@ -71,14 +89,17 @@ if __name__ == "__main__":

     # replace cat names by codes in col "category"
     # first the manual replacements
-    df_inventory_long["category"] = \
-        df_inventory_long["category"].replace(inv_conf["cat_codes_manual"])
+    df_inventory_long["category"] = df_inventory_long["category"].replace(
+        inv_conf["cat_codes_manual"]
+    )
+
     # then the regex replacements
-    def repl(m):
-        return m.group('code')
-    df_inventory_long["category"] = \
-        df_inventory_long["category"].str.replace(inv_conf["cat_code_regexp"], repl,
-                                                  regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_inventory_long["category"] = df_inventory_long["category"].str.replace(
+        inv_conf["cat_code_regexp"], repl, regex=True
+    )
     df_inventory_long = df_inventory_long.reset_index(drop=True)

     # make sure all col headers are str
@@ -89,17 +110,17 @@ if __name__ == "__main__":
     data_inventory_IF = pm2.pm2io.convert_long_dataframe_if(
         df_inventory_long,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )

     # ###
     # read the main sector time series and convert to PM2 IF
@@ -115,24 +136,24 @@ if __name__ == "__main__":
     df_main_sector_ts = df_main_sector_ts.drop(0)

     # replace cat names by codes in col "category"
-    df_main_sector_ts["category"] = \
-        df_main_sector_ts["category"].replace(cat_codes_manual_main_sector_ts)
+    df_main_sector_ts["category"] = df_main_sector_ts["category"].replace(
+        cat_codes_manual_main_sector_ts
+    )

     data_main_sector_ts_IF = pm2.pm2io.convert_wide_dataframe_if(
         df_main_sector_ts,
         coords_cols=coords_cols_main_sector_ts,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults_main_sector_ts,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
-        time_format='%Y',
-        )
-
+        time_format="%Y",
+    )

     # ###
     # read the indirect gases time series and convert to PM2 IF
@@ -150,17 +171,17 @@ if __name__ == "__main__":
     data_indirect_IF = pm2.pm2io.convert_wide_dataframe_if(
         df_indirect,
         coords_cols=coords_cols_indirect,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults_indirect,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )

     # ###
     # merge the three datasets
@@ -181,12 +202,15 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_all_if)
+        data_all_if,
+    )

     encoding = {var: compression for var in data_all_pm2.data_vars}
     data_all_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )

     # ###
     # ## process the data
@@ -194,14 +218,15 @@ if __name__ == "__main__":
     data_proc_pm2 = data_all_pm2

     # combine CO2 emissions and removals
-    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
-    data_proc_pm2["CO2"].attrs['entity'] = 'CO2'
+    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum(
+        dim="entity", skipna=True, min_count=1
+    )
+    data_proc_pm2["CO2"].attrs["entity"] = "CO2"
 
 
     # actual processing
     # actual processing
     data_proc_pm2 = process_data_for_country(
     data_proc_pm2 = process_data_for_country(
         data_proc_pm2,
         data_proc_pm2,
-        entities_to_ignore=['CO2 emissions', 'CO2 removals'],
+        entities_to_ignore=["CO2 emissions", "CO2 removals"],
         gas_baskets={},
         processing_info_country=country_processing_step1,
     )
@@ -211,16 +236,16 @@ if __name__ == "__main__":
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
         processing_info_country=country_processing_step2,
-        cat_terminology_out = terminology_proc,
-        category_conversion = cat_conversion,
-        sectors_out = sectors_to_save,
+        cat_terminology_out=terminology_proc,
+        category_conversion=cat_conversion,
+        sectors_out=sectors_to_save,
     )

     # adapt source and metadata
     # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)

     # ###
     # save data to IF and native format
@@ -229,9 +254,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )

     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 16 - 1
src/unfccc_ghg_data/unfccc_reader/__init__.py

@@ -1 +1,16 @@
-#
+"""Read individual country submissions
+
+The UNFCCC reader contains code to read individual country inventories,
+mostly submitted by non-AnnexI countries to the UNFCCC as Biennial Update Reports (
+BUR), National Communications (NC), and National Inventory Reports (NIR). Code to
+read other official country repositories is also included here as it uses the same
+setup.
+
+The code is organized in country folders which contain scripts for each submission
+and configuration files which can also be used for several submissions if the
+configuration is sufficiently similar.
+
+Data are mostly read from pdf files using camelot, but in some cases machine-readable
+files like xlsx are available which we prefer over pdfs.
+
+"""

+ 26 - 19
src/unfccc_ghg_data/unfccc_reader/get_submissions_info.py

@@ -1,19 +1,28 @@
-# helper functions to get information on available submissions
-# and data reading functions for a given country
+"""
+Helper functions for the unfccc_reader
+
+helper functions to get information on available submissions
+and data reading functions for a given country
+"""
 
 import json
 from pathlib import Path
 
-from unfccc_ghg_data.helper import (downloaded_data_path, extracted_data_path,
-                                    get_country_code, root_path)
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    get_country_code,
+    root_path,
+)
 
 code_path = root_path / "src" / "unfccc_ghg_data" / "unfccc_reader"
 # TODO: change this to use the code path stored in the helper module
 
+
 def get_possible_inputs(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
+    country_name: str,
+    submission: str,
+    print_info: bool = False,
 ) -> list[Path]:
     """
     For given country name and submission find the possible input files
@@ -71,10 +80,10 @@ def get_possible_inputs(
 
 
 def get_possible_outputs(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
-)-> list[Path]:
+    country_name: str,
+    submission: str,
+    print_info: bool = False,
+) -> list[Path]:
     """
     """
     For given country name and submission find the possible output files
     For given country name and submission find the possible output files
 
 
@@ -109,11 +118,15 @@ def get_possible_outputs(
             if country_code in folder_mapping:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping " "json file. Should be str."
+                    )
 
                 output_folder = item / country_folder
                 if output_folder.exists():
-                    for filepath in output_folder.glob(country_code + "_" + submission + "*"):
+                    for filepath in output_folder.glob(
+                        country_code + "_" + submission + "*"
+                    ):
                         output_files.append(filepath.relative_to(root_path))
 
     if print_info:
@@ -125,9 +138,3 @@ def get_possible_outputs(
             print("No output files found")
 
     return output_files
-
-
-
-
-
-
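A short, hedged usage sketch for the two helpers above (the country and submission values are illustrative, not guaranteed to exist in the repository):

    from unfccc_ghg_data.unfccc_reader.get_submissions_info import (
        get_possible_inputs,
        get_possible_outputs,
    )

    # list candidate input and output files for one submission
    input_files = get_possible_inputs("Colombia", "BUR3", print_info=True)
    output_files = get_possible_outputs("Colombia", "BUR3", print_info=True)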

+ 25 - 14
src/unfccc_ghg_data/unfccc_reader/read_UNFCCC_submission.py

@@ -1,28 +1,34 @@
-# this script takes submission and country as input (from doit) and
-# runs the appropriate script to extract the submission data
+"""
+wrapper to read UNFCCC submission
+
+Take submission and country as input (from doit) and
+run the appropriate script to extract the submission data
+
+"""
+
 
 import argparse
 
 import datalad.api
-from .get_submissions_info import (get_possible_inputs, get_possible_outputs)
 
 from unfccc_ghg_data.helper import get_code_file, root_path
 
+from .get_submissions_info import get_possible_inputs, get_possible_outputs
+
 if __name__ == "__main__":
     # Find the right function and possible input and output files and
     # read the data using datalad run.
     parser = argparse.ArgumentParser()
-    parser.add_argument('--country', help='Country name or code')
-    parser.add_argument('--submission', help='Submission to read')
+    parser.add_argument("--country", help="Country name or code")
+    parser.add_argument("--submission", help="Submission to read")
 
     args = parser.parse_args()
 
     country = args.country
     submission = args.submission
 
-
     print(f"Attempting to extract data for {submission} from {country}.")
-    print("#"*80)
+    print("#" * 80)
     print("")
 
     # get the correct script
@@ -35,8 +41,10 @@ if __name__ == "__main__":
         # get possible input files
         input_files = get_possible_inputs(country, submission)
         if not input_files:
-            print(f"No possible input files found for {country}, {submission}. "
-                  f"Something might be wrong here.")
+            print(
+                f"No possible input files found for {country}, {submission}. "
+                "Something might be wrong here."
+            )
         else:
             print("Found the following input_files:")
             for file in input_files:
@@ -51,8 +59,10 @@ if __name__ == "__main__":
         # get possible output files
         output_files = get_possible_outputs(country, submission)
         if not output_files:
-            print(f"No possible output files found for {country}, {submission}. "
-                  f"This is either the first run or something is wrong.")
+            print(
+                f"No possible output files found for {country}, {submission}. "
+                "This is either the first run or something is wrong."
+            )
         else:
             print("Found the following output_files:")
             for file in output_files:
@@ -74,6 +84,7 @@ if __name__ == "__main__":
     else:
         # no code found.
         print(f"No code found to read {submission} from {country}")
-        print(f"Use 'doit country_info country={country} to get "
-              f"a list of available submissions and datasets.")
-
+        print(
+            f"Use 'doit country_info country={country}' to get "
+            "a list of available submissions and datasets."
+        )
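For reference, a hedged sketch of how this wrapper can be dispatched (doit normally does this; the argument values are illustrative). Running it as a module keeps the relative import of get_submissions_info working:

    import subprocess

    subprocess.run(
        [
            "python",
            "-m",
            "unfccc_ghg_data.unfccc_reader.read_UNFCCC_submission",
            "--country",
            "Colombia",
            "--submission",
            "BUR3",
        ],
        check=True,
    )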