Jelajahi Sumber

Docstrings and dealing with ruff messages for unfccc_reader (not final as black is fighting ruff)

Johannes Gütschow 1 tahun lalu
induk
melakukan
06a9aceb8e
65 mengubah file dengan 7831 tambahan dan 4903 penghapusan
  1. 4 0
      Makefile
  2. 7 8
      docs/source/conf.py
  3. 15 1
      poetry.lock
  4. 1 0
      pyproject.toml
  5. 8 3
      src/unfccc_ghg_data/__init__.py
  6. 1 1
      src/unfccc_ghg_data/helper/__init__.py
  7. 234 109
      src/unfccc_ghg_data/helper/definitions.py
  8. 6 5
      src/unfccc_ghg_data/helper/folder_mapping.py
  9. 4 6
      src/unfccc_ghg_data/helper/functions.py
  10. 286 176
      src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_core.py
  11. 8 7
      src/unfccc_ghg_data/unfccc_di_reader/read_unfccc_di_for_country.py
  12. 5 8
      src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_bur.py
  13. 3 7
      src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_nc.py
  14. 27 4
      src/unfccc_ghg_data/unfccc_reader/Argentina/__init__.py
  15. 115 96
      src/unfccc_ghg_data/unfccc_reader/Argentina/read_ARG_BUR4_from_pdf.py
  16. 28 5
      src/unfccc_ghg_data/unfccc_reader/Chile/__init__.py
  17. 288 141
      src/unfccc_ghg_data/unfccc_reader/Chile/config_chl_bur4.py
  18. 90 52
      src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR4_from_xlsx.py
  19. 100 55
      src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR5_from_xlsx.py
  20. 30 1
      src/unfccc_ghg_data/unfccc_reader/Colombia/__init__.py
  21. 104 84
      src/unfccc_ghg_data/unfccc_reader/Colombia/read_COL_BUR3_from_xlsx.py
  22. 30 0
      src/unfccc_ghg_data/unfccc_reader/Indonesia/__init__.py
  23. 167 100
      src/unfccc_ghg_data/unfccc_reader/Indonesia/read_IDN_BUR3_from_pdf.py
  24. 30 0
      src/unfccc_ghg_data/unfccc_reader/Israel/__init__.py
  25. 409 314
      src/unfccc_ghg_data/unfccc_reader/Israel/config_isr_bur2.py
  26. 121 77
      src/unfccc_ghg_data/unfccc_reader/Israel/read_ISR_BUR2_from_pdf.py
  27. 30 0
      src/unfccc_ghg_data/unfccc_reader/Malaysia/__init__.py
  28. 922 602
      src/unfccc_ghg_data/unfccc_reader/Malaysia/config_mys_bur3.py
  29. 258 253
      src/unfccc_ghg_data/unfccc_reader/Malaysia/config_mys_bur4.py
  30. 82 52
      src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR3_from_pdf.py
  31. 84 55
      src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR4_from_pdf.py
  32. 30 0
      src/unfccc_ghg_data/unfccc_reader/Mexico/__init__.py
  33. 81 35
      src/unfccc_ghg_data/unfccc_reader/Mexico/config_mex_bur3.py
  34. 63 66
      src/unfccc_ghg_data/unfccc_reader/Mexico/read_MEX_BUR3_from_pdf.py
  35. 30 0
      src/unfccc_ghg_data/unfccc_reader/Montenegro/__init__.py
  36. 103 47
      src/unfccc_ghg_data/unfccc_reader/Montenegro/config_mne_bur3.py
  37. 88 56
      src/unfccc_ghg_data/unfccc_reader/Montenegro/read_MNE_BUR3_from_pdf.py
  38. 30 0
      src/unfccc_ghg_data/unfccc_reader/Morocco/__init__.py
  39. 187 108
      src/unfccc_ghg_data/unfccc_reader/Morocco/config_mar_bur3.py
  40. 122 88
      src/unfccc_ghg_data/unfccc_reader/Morocco/read_MAR_BUR3_from_pdf.py
  41. 30 0
      src/unfccc_ghg_data/unfccc_reader/Nigeria/__init__.py
  42. 294 272
      src/unfccc_ghg_data/unfccc_reader/Nigeria/config_nga_bur2.py
  43. 137 103
      src/unfccc_ghg_data/unfccc_reader/Nigeria/read_NGA_BUR2_from_pdf.py
  44. 30 0
      src/unfccc_ghg_data/unfccc_reader/Peru/__init__.py
  45. 77 66
      src/unfccc_ghg_data/unfccc_reader/Peru/config_per_bur3.py
  46. 33 20
      src/unfccc_ghg_data/unfccc_reader/Peru/read_PER_BUR3_from_pdf.py
  47. 30 0
      src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/__init__.py
  48. 511 403
      src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/config_kor_bur4.py
  49. 125 76
      src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2021_Inventory_from_xlsx.py
  50. 140 82
      src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2022_Inventory_from_xlsx.py
  51. 75 47
      src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py
  52. 30 0
      src/unfccc_ghg_data/unfccc_reader/Singapore/__init__.py
  53. 408 256
      src/unfccc_ghg_data/unfccc_reader/Singapore/config_sgp_bur5.py
  54. 110 72
      src/unfccc_ghg_data/unfccc_reader/Singapore/read_SGP_BUR5_from_pdf.py
  55. 30 0
      src/unfccc_ghg_data/unfccc_reader/Taiwan/__init__.py
  56. 194 120
      src/unfccc_ghg_data/unfccc_reader/Taiwan/config_twn_nir2022.py
  57. 164 104
      src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2022_Inventory_from_pdf.py
  58. 30 0
      src/unfccc_ghg_data/unfccc_reader/Thailand/__init__.py
  59. 405 223
      src/unfccc_ghg_data/unfccc_reader/Thailand/config_tha_bur3.py
  60. 461 250
      src/unfccc_ghg_data/unfccc_reader/Thailand/config_tha_bur4.py
  61. 129 89
      src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR3_from_pdf.py
  62. 90 64
      src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR4_from_pdf.py
  63. 16 1
      src/unfccc_ghg_data/unfccc_reader/__init__.py
  64. 26 19
      src/unfccc_ghg_data/unfccc_reader/get_submissions_info.py
  65. 25 14
      src/unfccc_ghg_data/unfccc_reader/read_UNFCCC_submission.py

+ 4 - 0
Makefile

@@ -40,6 +40,10 @@ black:  ## format the code using black
 ruff-fixes:  ## fix the code using ruff
 	poetry run ruff src tests scripts docs/source/conf.py docs/source/notebooks/*.py --fix
 
+.PHONY: ruff-fixes-current
+ruff-fixes-current:  ## fix the code using ruff
+	poetry run ruff src/unfccc_ghg_data/unfccc_reader --fix
+
 
 .PHONY: test
 test:  ## run the tests

+ 7 - 8
docs/source/conf.py

@@ -4,17 +4,16 @@ Configuration file for the Sphinx documentation builder.
 For the full list of built-in configuration values, see the documentation:
 https://www.sphinx-doc.org/en/master/usage/configuration.html
 """
+import os
 from functools import wraps
+from pathlib import Path
 
 from sphinxcontrib_autodocgen import AutoDocGen
 
-import os
-from pathlib import Path
 os.environ["UNFCCC_GHG_ROOT_PATH"] = str(Path("..") / "..")
 
 import unfccc_ghg_data
 
-
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
@@ -58,7 +57,7 @@ extensions = [
     # math support
     "sphinx.ext.mathjax",
     # execute code
-    # "sphinx_exec_code",
+    "sphinx_exec_code",
 ]
 
 # general sphinx settings
@@ -144,10 +143,10 @@ nb_execution_show_tb = True
 nb_execution_timeout = 120
 nb_custom_formats = {".py": ["jupytext.reads", {"fmt": "py:percent"}]}
 
-# # exec-code config
-# exec_code_working_dir = Path('..') / '..'
-# exec_code_source_folders = [Path('..') / '..' / 'src' / 'unfccc_ghg_data']
-# exec_code_example_dir = '.'
+# exec-code config
+exec_code_working_dir = "."  # Path('..') / '..'
+exec_code_source_folders = [Path("..") / ".." / "src" / "unfccc_ghg_data"]
+exec_code_example_dir = "."
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

+ 15 - 1
poetry.lock

@@ -1068,6 +1068,20 @@ smb = ["smbprotocol"]
 ssh = ["paramiko"]
 tqdm = ["tqdm"]
 
+[[package]]
+name = "ghostscript"
+version = "0.7"
+description = "Interface to the Ghostscript C-API, both high- and low-level, based on ctypes"
+optional = false
+python-versions = "*"
+files = [
+    {file = "ghostscript-0.7-py2.py3-none-any.whl", hash = "sha256:97c70e27ba6b1cab4ab1d9b4cc82d89b8b53e57971f608ded4950b8aa20c78a7"},
+    {file = "ghostscript-0.7.tar.gz", hash = "sha256:b7875a87098740eb0be3de2d9662d15db727305ca9a6d4b7534a3cc33a4b965a"},
+]
+
+[package.dependencies]
+setuptools = ">=38.6.0"
+
 [[package]]
 name = "globalwarmingpotentials"
 version = "0.9.3"
@@ -4375,4 +4389,4 @@ plots = ["matplotlib"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "db0b517e6af6c99b04624df636fc38cdf49b3ec8dd6dce24596da1cf5796c0ac"
+content-hash = "3591f5e1b1134c148b9f68e3861beb4961659d1af5cb4dd7360ef5396a682f2e"

+ 1 - 0
pyproject.toml

@@ -22,6 +22,7 @@ opencv-python = "^4.8.1.78"
 unfccc-di-api = "^4.0.0"
 dask = "^2023.12.0"
 sphinx-exec-code = "^0.10"
+ghostscript = "^0.7"
 
 [tool.poetry.extras]
 plots = ["matplotlib"]

+ 8 - 3
src/unfccc_ghg_data/__init__.py

@@ -6,15 +6,20 @@ different methods from APIs, xlsx and csv files as well as pdf files.
 """
 import importlib.metadata
 
-from . import (helper, unfccc_reader, unfccc_downloader, unfccc_crf_reader,
-               unfccc_di_reader)
+from . import (
+    helper,
+    unfccc_crf_reader,
+    unfccc_di_reader,
+    unfccc_downloader,
+    unfccc_reader,
+)
 
 __all__ = [
     "helper",
     "unfccc_reader",
     "unfccc_crf_reader",
     "unfccc_di_reader",
-    "unfccc_downloader"
+    "unfccc_downloader",
 ]
 
 __version__ = importlib.metadata.version("unfccc_ghg_data")

+ 1 - 1
src/unfccc_ghg_data/helper/__init__.py

@@ -25,10 +25,10 @@ from .functions import (
     convert_categories,
     create_folder_mapping,
     fix_rows,
+    get_code_file,
     get_country_code,
     get_country_name,
     process_data_for_country,
-    get_code_file,
 )
 
 __all__ = [

+ 234 - 109
src/unfccc_ghg_data/helper/definitions.py

@@ -6,14 +6,14 @@ from pathlib import Path
 
 def get_root_path() -> Path:
     """Get the root_path from an environment variable"""
-    root_path_env = os.getenv('UNFCCC_GHG_ROOT_PATH', None)
+    root_path_env = os.getenv("UNFCCC_GHG_ROOT_PATH", None)
     if root_path_env is None:
-        raise ValueError('UNFCCC_GHG_ROOT_PATH environment '
-                         'variable needs to be set') # noqa: TRY003
+        raise ValueError("UNFCCC_GHG_ROOT_PATH environment " "variable needs to be set")
     else:
         root_path = Path(root_path_env).resolve()
     return root_path
 
+
 root_path = get_root_path()
 code_path = root_path / "src" / "unfccc_ghg_data"
 log_path = root_path / "log"
@@ -36,125 +36,250 @@ custom_country_mapping = {
 }
 
 custom_folders = {
-    'Venezeula_(Bolivarian_Republic_of)': 'VEN',
-    'Venezuela_(Bolivarian_Republic_of)': 'VEN',
-    'Micronesia_(Federated_State_of)': 'FSM',
-    'Micronesia_(Federated_States_of)': 'FSM',
-    'The_Republic_of_North_Macedonia': 'MKD',
-    'Republic_of_Korea': 'KOR',
-    'Bolivia_(Plurinational_State_of)': 'BOL',
-    'Türkiye': 'TUR',
-    'Iran_(Islamic_Republic_of)': 'IRN',
-    'Côte_d`Ivoire': 'CIV',
-    'Democratic_Republic_of_the_Congo': "COD",
-    'European_Union': 'EUA',
-    'Taiwan': 'TWN',
+    "Venezeula_(Bolivarian_Republic_of)": "VEN",
+    "Venezuela_(Bolivarian_Republic_of)": "VEN",
+    "Micronesia_(Federated_State_of)": "FSM",
+    "Micronesia_(Federated_States_of)": "FSM",
+    "The_Republic_of_North_Macedonia": "MKD",
+    "Republic_of_Korea": "KOR",
+    "Bolivia_(Plurinational_State_of)": "BOL",
+    "Türkiye": "TUR",
+    "Iran_(Islamic_Republic_of)": "IRN",
+    "Côte_d`Ivoire": "CIV",
+    "Democratic_Republic_of_the_Congo": "COD",
+    "European_Union": "EUA",
+    "Taiwan": "TWN",
 }
 
 GWP_factors = {
-    'SARGWP100_to_AR4GWP100': {
-        'HFCS': 1.1,
-        'PFCS': 1.1,
-        'UnspMixOfHFCs': 1.1,
-        'UnspMixOfPFCs': 1.1,
-        'FGASES': 1.1,
+    "SARGWP100_to_AR4GWP100": {
+        "HFCS": 1.1,
+        "PFCS": 1.1,
+        "UnspMixOfHFCs": 1.1,
+        "UnspMixOfPFCs": 1.1,
+        "FGASES": 1.1,
     },
-    'SARGWP100_to_AR5GWP100': {
-        'HFCS': 1.2,
-        'PFCS': 1.2,
-        'UnspMixOfHFCs': 1.2,
-        'UnspMixOfPFCs': 1.2,
-        'FGASES': 1.2,
+    "SARGWP100_to_AR5GWP100": {
+        "HFCS": 1.2,
+        "PFCS": 1.2,
+        "UnspMixOfHFCs": 1.2,
+        "UnspMixOfPFCs": 1.2,
+        "FGASES": 1.2,
     },
-    'SARGWP100_to_AR6GWP100': {
-        'HFCS': 1.4,
-        'PFCS': 1.3,
-        'UnspMixOfHFCs': 1.4,
-        'UnspMixOfPFCs': 1.3,
-        'FGASES': 1.35,
+    "SARGWP100_to_AR6GWP100": {
+        "HFCS": 1.4,
+        "PFCS": 1.3,
+        "UnspMixOfHFCs": 1.4,
+        "UnspMixOfPFCs": 1.3,
+        "FGASES": 1.35,
     },
-    'AR4GWP100_to_SARGWP100': {
-        'HFCS': 0.91,
-        'PFCS': 0.91,
-        'UnspMixOfHFCs': 0.91,
-        'UnspMixOfPFCs': 0.91,
-        'FGASES': 0.91,
+    "AR4GWP100_to_SARGWP100": {
+        "HFCS": 0.91,
+        "PFCS": 0.91,
+        "UnspMixOfHFCs": 0.91,
+        "UnspMixOfPFCs": 0.91,
+        "FGASES": 0.91,
     },
-    'AR4GWP100_to_AR5GWP100': {
-        'HFCS': 1.1,
-        'PFCS': 1.1,
-        'UnspMixOfHFCs': 1.1,
-        'UnspMixOfPFCs': 1.1,
-        'FGASES': 1.1,
+    "AR4GWP100_to_AR5GWP100": {
+        "HFCS": 1.1,
+        "PFCS": 1.1,
+        "UnspMixOfHFCs": 1.1,
+        "UnspMixOfPFCs": 1.1,
+        "FGASES": 1.1,
     },
-    'AR4GWP100_to_AR6GWP100': {
-        'HFCS': 1.27,
-        'PFCS': 1.18,
-        'UnspMixOfHFCs': 1.27,
-        'UnspMixOfPFCs': 1.18,
-        'FGASES': 1.23,
+    "AR4GWP100_to_AR6GWP100": {
+        "HFCS": 1.27,
+        "PFCS": 1.18,
+        "UnspMixOfHFCs": 1.27,
+        "UnspMixOfPFCs": 1.18,
+        "FGASES": 1.23,
     },
-    'AR5GWP100_to_SARGWP100': {
-        'HFCS': 0.83,
-        'PFCS': 0.83,
-        'UnspMixOfHFCs': 0.83,
-        'UnspMixOfPFCs': 0.83,
-        'FGASES': 0.83,
+    "AR5GWP100_to_SARGWP100": {
+        "HFCS": 0.83,
+        "PFCS": 0.83,
+        "UnspMixOfHFCs": 0.83,
+        "UnspMixOfPFCs": 0.83,
+        "FGASES": 0.83,
     },
-    'AR5GWP100_to_AR4GWP100': {
-        'HFCS': 0.91,
-        'PFCS': 0.91,
-        'UnspMixOfHFCs': 0.91,
-        'UnspMixOfPFCs': 0.91,
-        'FGASES': 0.91,
+    "AR5GWP100_to_AR4GWP100": {
+        "HFCS": 0.91,
+        "PFCS": 0.91,
+        "UnspMixOfHFCs": 0.91,
+        "UnspMixOfPFCs": 0.91,
+        "FGASES": 0.91,
     },
-    'AR5GWP100_to_AR6GWP100': {
-        'HFCS': 1.17,
-        'PFCS': 1.08,
-        'UnspMixOfHFCs': 1.17,
-        'UnspMixOfPFCs': 1.08,
-        'FGASES': 1.125,
+    "AR5GWP100_to_AR6GWP100": {
+        "HFCS": 1.17,
+        "PFCS": 1.08,
+        "UnspMixOfHFCs": 1.17,
+        "UnspMixOfPFCs": 1.08,
+        "FGASES": 1.125,
     },
 }
 
 gas_baskets = {
-    'HFCS (SARGWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
-                     'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
-                     'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
-                     'HFC407c', 'HFC410a', 'HFC4310mee', #'OTHERHFCS (SARGWP100)',
-                         'UnspMixOfHFCs (SARGWP100)'],
-    'HFCS (AR4GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
-                     'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
-                     'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
-                     'HFC407c', 'HFC410a', 'HFC4310mee', 'UnspMixOfHFCs (AR4GWP100)'],
-    'HFCS (AR5GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
-                      'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
-                      'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
-                      'HFC407c', 'HFC410a', 'HFC4310mee',
-                         'UnspMixOfHFCs (AR5GWP100)'],
-    'HFCS (AR6GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
-                      'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
-                      'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
-                      'HFC407c', 'HFC410a', 'HFC4310mee',
-                         'UnspMixOfHFCs (AR6GWP100)'],
-    'PFCS (SARGWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',
-                      'UnspMixOfPFCs (SARGWP100)'],
-    'PFCS (AR4GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',
-                      'UnspMixOfPFCs (AR4GWP100)'],
-    'PFCS (AR5GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',
-                      'UnspMixOfPFCs (AR5GWP100)'],
-    'PFCS (AR6GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',
-                      'UnspMixOfPFCs (AR6GWP100)'],
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (SARGWP100)',
-                          'PFCS (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR4GWP100)',
-                          'PFCS (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR5GWP100)',
-                            'PFCS (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR6GWP100)',
-                            'PFCS (AR6GWP100)'],
+    "HFCS (SARGWP100)": [
+        "HFC23",
+        "HFC32",
+        "HFC41",
+        "HFC125",
+        "HFC134",
+        "HFC134a",
+        "HFC143",
+        "HFC143a",
+        "HFC152a",
+        "HFC227ea",
+        "HFC236fa",
+        "HFC245ca",
+        "HFC245fa",
+        "HFC365mfc",
+        "HFC404a",
+        "HFC407c",
+        "HFC410a",
+        "HFC4310mee",  #'OTHERHFCS (SARGWP100)',
+        "UnspMixOfHFCs (SARGWP100)",
+    ],
+    "HFCS (AR4GWP100)": [
+        "HFC23",
+        "HFC32",
+        "HFC41",
+        "HFC125",
+        "HFC134",
+        "HFC134a",
+        "HFC143",
+        "HFC143a",
+        "HFC152a",
+        "HFC227ea",
+        "HFC236fa",
+        "HFC245ca",
+        "HFC245fa",
+        "HFC365mfc",
+        "HFC404a",
+        "HFC407c",
+        "HFC410a",
+        "HFC4310mee",
+        "UnspMixOfHFCs (AR4GWP100)",
+    ],
+    "HFCS (AR5GWP100)": [
+        "HFC23",
+        "HFC32",
+        "HFC41",
+        "HFC125",
+        "HFC134",
+        "HFC134a",
+        "HFC143",
+        "HFC143a",
+        "HFC152a",
+        "HFC227ea",
+        "HFC236fa",
+        "HFC245ca",
+        "HFC245fa",
+        "HFC365mfc",
+        "HFC404a",
+        "HFC407c",
+        "HFC410a",
+        "HFC4310mee",
+        "UnspMixOfHFCs (AR5GWP100)",
+    ],
+    "HFCS (AR6GWP100)": [
+        "HFC23",
+        "HFC32",
+        "HFC41",
+        "HFC125",
+        "HFC134",
+        "HFC134a",
+        "HFC143",
+        "HFC143a",
+        "HFC152a",
+        "HFC227ea",
+        "HFC236fa",
+        "HFC245ca",
+        "HFC245fa",
+        "HFC365mfc",
+        "HFC404a",
+        "HFC407c",
+        "HFC410a",
+        "HFC4310mee",
+        "UnspMixOfHFCs (AR6GWP100)",
+    ],
+    "PFCS (SARGWP100)": [
+        "C3F8",
+        "C4F10",
+        "CF4",
+        "C2F6",
+        "C6F14",
+        "C5F12",
+        "cC4F8",
+        "UnspMixOfPFCs (SARGWP100)",
+    ],
+    "PFCS (AR4GWP100)": [
+        "C3F8",
+        "C4F10",
+        "CF4",
+        "C2F6",
+        "C6F14",
+        "C5F12",
+        "cC4F8",
+        "UnspMixOfPFCs (AR4GWP100)",
+    ],
+    "PFCS (AR5GWP100)": [
+        "C3F8",
+        "C4F10",
+        "CF4",
+        "C2F6",
+        "C6F14",
+        "C5F12",
+        "cC4F8",
+        "UnspMixOfPFCs (AR5GWP100)",
+    ],
+    "PFCS (AR6GWP100)": [
+        "C3F8",
+        "C4F10",
+        "CF4",
+        "C2F6",
+        "C6F14",
+        "C5F12",
+        "cC4F8",
+        "UnspMixOfPFCs (AR6GWP100)",
+    ],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": [
+        "CO2",
+        "CH4",
+        "N2O",
+        "SF6",
+        "NF3",
+        "HFCS (SARGWP100)",
+        "PFCS (SARGWP100)",
+    ],
+    "KYOTOGHG (AR4GWP100)": [
+        "CO2",
+        "CH4",
+        "N2O",
+        "SF6",
+        "NF3",
+        "HFCS (AR4GWP100)",
+        "PFCS (AR4GWP100)",
+    ],
+    "KYOTOGHG (AR5GWP100)": [
+        "CO2",
+        "CH4",
+        "N2O",
+        "SF6",
+        "NF3",
+        "HFCS (AR5GWP100)",
+        "PFCS (AR5GWP100)",
+    ],
+    "KYOTOGHG (AR6GWP100)": [
+        "CO2",
+        "CH4",
+        "N2O",
+        "SF6",
+        "NF3",
+        "HFCS (AR6GWP100)",
+        "PFCS (AR6GWP100)",
+    ],
 }

+ 6 - 5
src/unfccc_ghg_data/helper/folder_mapping.py

@@ -1,4 +1,4 @@
-""" create mapping of folder to countries
+"""create mapping of folder to countries
 
 this script takes a folder as input (from doit) and
 runs creates the mapping of subfolders to country codes
@@ -13,16 +13,17 @@ if __name__ == "__main__":
     # Find the right function and possible input and output files and
     # read the data using datalad run.
     parser = argparse.ArgumentParser()
-    parser.add_argument('--folder', help='folder name, relative to '
-                                         'repository root folder')
+    parser.add_argument(
+        "--folder", help="folder name, relative to " "repository root folder"
+    )
     args = parser.parse_args()
     folder = args.folder
 
-    if 'extracted_data' in folder:
+    if "extracted_data" in folder:
         extracted = True
     else:
         extracted = False
 
     # print available submissions
-    print("="*10 + f" Creating folder mapping for  {folder} " + "="*10)
+    print("=" * 10 + f" Creating folder mapping for  {folder} " + "=" * 10)
     create_folder_mapping(folder, extracted)

+ 4 - 6
src/unfccc_ghg_data/helper/functions.py

@@ -1,4 +1,4 @@
-""" common functions for unfccc_ghg_data
+"""common functions for unfccc_ghg_data
 
 Functions used by the different readers and downloaders in the unfccc_ghg_data package
 """
@@ -74,8 +74,6 @@ def process_data_for_country(
     xr.Dataset: processed dataset
 
     """
-
-
     # 0: gather information
     countries = list(data_country.coords[data_country.attrs["area"]].values)
     if len(countries) > 1:
@@ -956,9 +954,7 @@ def get_code_file(
                     )
                 else:
                     if print_info:
-                        print(
-                            f"Found code file {file.relative_to(root_path)}"
-                        )
+                        print(f"Found code file {file.relative_to(root_path)}")
                 code_file_path = file
 
     if code_file_path is not None:
@@ -1011,8 +1007,10 @@ def fix_rows(
         new_row = new_row.str.replace("- ", "-")
         # replace spaces in numbers
         pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
+
         def repl(m):
             return f"{m.group('first')}{m.group('last')}"
+
         new_row = new_row.str.replace(pat, repl, regex=True)
         data.loc[indices_to_merge[0]] = new_row
         data = data.drop(indices_to_merge[1:])

+ 286 - 176
src/unfccc_ghg_data/unfccc_crf_reader/unfccc_crf_reader_core.py

@@ -27,13 +27,13 @@ from .util import NoCRFFilesError
 
 ### reading functions
 def convert_crf_table_to_pm2if(
-        df_table: pd.DataFrame,
-        submission_year: int,
-        entity_mapping: Optional[dict[str,str]]=None,
-        coords_defaults_input: Optional[dict[str,str]]=None,
-        filter_remove_input: Optional[dict[str,dict[str,Union[str,list]]]]=None,
-        filter_keep_input: Optional[dict[str,dict[str,Union[str,list]]]]=None,
-        meta_data_input: Optional[dict[str,str]]=None,
+    df_table: pd.DataFrame,
+    submission_year: int,
+    entity_mapping: Optional[dict[str, str]] = None,
+    coords_defaults_input: Optional[dict[str, str]] = None,
+    filter_remove_input: Optional[dict[str, dict[str, Union[str, list]]]] = None,
+    filter_keep_input: Optional[dict[str, dict[str, Union[str, list]]]] = None,
+    meta_data_input: Optional[dict[str, str]] = None,
 ) -> pd.DataFrame:
     """
     Converts a given pandas long format crf table to PRIMAP2 interchange format
@@ -82,7 +82,7 @@ def convert_crf_table_to_pm2if(
     }
 
     add_coords_cols = {
-    #    "orig_cat_name": ["orig_cat_name", "category"],
+        #    "orig_cat_name": ["orig_cat_name", "category"],
     }
 
     coords_terminologies = {
@@ -108,8 +108,8 @@ def convert_crf_table_to_pm2if(
     if entity_mapping is not None:
         coords_value_mapping["entity"] = entity_mapping
 
-    #coords_value_filling_template = {
-    #}
+    # coords_value_filling_template = {
+    # }
 
     filter_remove = {
         "f1": {
@@ -120,13 +120,11 @@ def convert_crf_table_to_pm2if(
         for key in filter_remove_input.keys():
             filter_remove[key] = filter_remove_input[key]
 
-    filter_keep = {
-    }
+    filter_keep = {}
     if filter_keep_input is not None:
         for key in filter_keep_input.keys():
             filter_keep[key] = filter_keep_input[key]
 
-
     meta_data = {
         "references": f"https://unfccc.int/ghg-inventories-annex-i-parties/{submission_year}",
         "rights": "",
@@ -146,7 +144,7 @@ def convert_crf_table_to_pm2if(
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
         filter_keep=filter_keep,
         meta_data=meta_data,
@@ -156,13 +154,13 @@ def convert_crf_table_to_pm2if(
 
 
 def read_crf_table(
-        country_codes: Union[str, list[str]],
-        table: str,
-        submission_year: int,
-        data_year: Optional[Union[int, list[int]]]=None,
-        date: Optional[str]=None,
-        folder: Optional[str]=None,
-        debug: Optional[bool]=False,
+    country_codes: Union[str, list[str]],
+    table: str,
+    submission_year: int,
+    data_year: Optional[Union[int, list[int]]] = None,
+    date: Optional[str] = None,
+    folder: Optional[str] = None,
+    debug: Optional[bool] = False,
 ) -> tuple[pd.DataFrame, list[list], list[list]]:
     """
     Read CRF table for given submission year and country / or countries
@@ -216,13 +214,15 @@ def read_crf_table(
         country_codes = [country_codes]
 
     # get file names and locations
-    input_files = get_crf_files(country_codes=country_codes,
-                                submission_year=submission_year,
-                                data_year=data_year,
-                                date=date,
-                                folder=folder)
+    input_files = get_crf_files(
+        country_codes=country_codes,
+        submission_year=submission_year,
+        data_year=data_year,
+        date=date,
+        folder=folder,
+    )
     # nasty fix for cases where exporting ran overnight and not all files have the same date
-    if (date is not None) and (len(country_codes)==1):
+    if (date is not None) and (len(country_codes) == 1):
         if isinstance(data_year, list):
             expected_files = len(data_year)
         elif isinstance(data_year, int):
@@ -230,17 +230,23 @@ def read_crf_table(
         else:
             expected_files = submission_year - 1990 - 1
         if len(input_files) < expected_files:
-            print(f"Found only {len(input_files)} input files for {country_codes}. "
-                  f"Expected {expected_files}.")
-            print("Possibly exporting run overnight and some files have the previous day as date.")
+            print(
+                f"Found only {len(input_files)} input files for {country_codes}. "
+                f"Expected {expected_files}."
+            )
+            print(
+                "Possibly exporting run overnight and some files have the previous day as date."
+            )
             date_datetime = datetime.strptime(date, "%d%m%Y")
             date_datetime = date_datetime - timedelta(days=1)
             prv_date = date_datetime.strftime("%d%m%Y")
-            more_input_files = get_crf_files(country_codes=country_codes,
-                                             submission_year=submission_year,
-                                             data_year=data_year,
-                                             date=prv_date,
-                                             folder=folder)
+            more_input_files = get_crf_files(
+                country_codes=country_codes,
+                submission_year=submission_year,
+                data_year=data_year,
+                date=prv_date,
+                folder=folder,
+            )
             if len(more_input_files) > 0:
                 print(f"Found {len(more_input_files)} additional input files.")
                 input_files = input_files + more_input_files
@@ -248,11 +254,13 @@ def read_crf_table(
                 print("Found no additional input files")
 
     if input_files == []:
-        raise NoCRFFilesError(f"No files found for {country_codes}, "
-                              f"submission_year={submission_year}, "
-                              f"data_year={data_year}, "
-                              f"date={date}, "
-                              f"folder={folder}.")
+        raise NoCRFFilesError(
+            f"No files found for {country_codes}, "
+            f"submission_year={submission_year}, "
+            f"data_year={data_year}, "
+            f"date={date}, "
+            f"folder={folder}."
+        )
 
     # get specification
     # if we only have a single country check if we might have a country specific
@@ -260,21 +268,25 @@ def read_crf_table(
     if len(country_codes) == 1:
         try:
             crf_spec = getattr(crf, f"CRF{submission_year}_{country_codes[0]}")
-            print(f"Using country specific specification: "
-                  f"CRF{submission_year}_{country_codes[0]}")
+            print(
+                f"Using country specific specification: "
+                f"CRF{submission_year}_{country_codes[0]}"
+            )
         except:
             # no country specific specification, check for general specification
             try:
                 crf_spec = getattr(crf, f"CRF{submission_year}")
             except:
-                raise ValueError(f"No terminology exists for submission year "
-                                 f"{submission_year}")
+                raise ValueError(
+                    f"No terminology exists for submission year " f"{submission_year}"
+                )
     else:
         try:
             crf_spec = getattr(crf, f"CRF{submission_year}")
         except:
-            raise ValueError(f"No terminology exists for submission year "
-                             f"{submission_year}")
+            raise ValueError(
+                f"No terminology exists for submission year " f"{submission_year}"
+            )
 
     # now loop over files and read them
     df_all = None
@@ -284,8 +296,11 @@ def read_crf_table(
         file_info = get_info_from_crf_filename(file.name)
         try:
             int(file_info["data_year"])
-            df_this_file, unknown_rows_this_file, last_row_info_this_file = \
-                read_crf_table_from_file(file, table, crf_spec[table], debug=debug)
+            (
+                df_this_file,
+                unknown_rows_this_file,
+                last_row_info_this_file,
+            ) = read_crf_table_from_file(file, table, crf_spec[table], debug=debug)
             if df_all is None:
                 df_all = df_this_file.copy(deep=True)
                 unknown_rows = unknown_rows_this_file
@@ -301,10 +316,10 @@ def read_crf_table(
 
 
 def read_crf_table_from_file(
-        file: Path,
-        table: str,
-        table_spec: dict[str, dict],
-        debug: Optional[bool]=False,
+    file: Path,
+    table: str,
+    table_spec: dict[str, dict],
+    debug: Optional[bool] = False,
 ) -> tuple[pd.DataFrame, list[list], list[list]]:
     """
     Read a single CRF table from a given file. This is the core function of the CRF
@@ -344,7 +359,6 @@ def read_crf_table_from_file(
             dlds = dl.api.Dataset(root_path)
             dlds.get(file.relative_to(root_path))
 
-
     table_properties = table_spec["table"]
     file_info = get_info_from_crf_filename(file.name)
 
@@ -353,16 +367,23 @@ def read_crf_table_from_file(
     all_cats = [cat[0] for cat in all_cats_mapping]
 
     unique_cats = [cat for (cat, count) in Counter(all_cats).items() if count == 1]
-    unique_cat_tuples = [mapping for mapping in all_cats_mapping if mapping[0] in unique_cats]
-    unique_mapping = dict(zip([tup[0] for tup in unique_cat_tuples],
-                              [tup[1] for tup in unique_cat_tuples]))
+    unique_cat_tuples = [
+        mapping for mapping in all_cats_mapping if mapping[0] in unique_cats
+    ]
+    unique_mapping = dict(
+        zip(
+            [tup[0] for tup in unique_cat_tuples], [tup[1] for tup in unique_cat_tuples]
+        )
+    )
     non_unique_cats = [cat for (cat, count) in Counter(all_cats).items() if count > 1]
 
     # prepare the sector hierarchy
     if non_unique_cats:
         # if we have non-unique categories present we need the information on
         # levels within the category hierarchy
-        category_tree = create_category_tree(all_cats_mapping, table, file_info["party"])
+        category_tree = create_category_tree(
+            all_cats_mapping, table, file_info["party"]
+        )
 
     # prepare index column information
     cat_col = table_properties["col_for_categories"]
@@ -372,20 +393,37 @@ def read_crf_table_from_file(
     # read the data
     print(f"Reading table {table} for year {file_info['data_year']} from {file.name}.")
     skiprows = table_properties["firstrow"] - 1
-    nrows = table_properties["lastrow"] - skiprows + 1 # read one row more to check if we reached the end
+    nrows = (
+        table_properties["lastrow"] - skiprows + 1
+    )  # read one row more to check if we reached the end
     # we read with user specific NaN treatment as the NaN treatment is part of the conversion to
     # PRIMAP2 format.
-    df_raw = pd.read_excel(file, sheet_name=table, skiprows=skiprows , nrows=nrows, engine="openpyxl",
-                               na_values=['-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN',
-                                          'NULL', 'NaN', ''], keep_default_na=False)
+    df_raw = pd.read_excel(
+        file,
+        sheet_name=table,
+        skiprows=skiprows,
+        nrows=nrows,
+        engine="openpyxl",
+        na_values=[
+            "-1.#IND",
+            "-1.#QNAN",
+            "-NaN",
+            "-nan",
+            "1.#IND",
+            "1.#QNAN",
+            "NULL",
+            "NaN",
+            "",
+        ],
+        keep_default_na=False,
+    )
 
     if len(df_raw) < nrows:
-        #print(f"read data truncated because of all-nan rows")
+        # print(f"read data truncated because of all-nan rows")
         last_row_nan = True
     else:
         last_row_nan = False
 
-
     cols_to_drop = []
     # remove empty first column (for Australia tables start with an empty column)
     # df_raw = df_raw.dropna(how="all", axis=1)
@@ -394,13 +432,14 @@ def read_crf_table_from_file(
     # select only first table by cutting everything after a all-nan column (unless
     # it's the first column)
     if debug:
-        print(f'Header before table end detection: {df_raw.columns.values}')
+        print(f"Header before table end detection: {df_raw.columns.values}")
     for colIdx in range(1, len(df_raw.columns.values)):
-        if ((df_raw.iloc[:, colIdx].isna().all()) &
-                (df_raw.columns[colIdx].startswith('Unnamed'))):
+        if (df_raw.iloc[:, colIdx].isna().all()) & (
+            df_raw.columns[colIdx].startswith("Unnamed")
+        ):
             cols_to_drop = cols_to_drop + list(df_raw.columns.values[colIdx:])
             if debug:
-                print(f'cols_to_drop: {cols_to_drop}')
+                print(f"cols_to_drop: {cols_to_drop}")
             break
 
     if cols_to_drop is not None:
@@ -414,7 +453,7 @@ def read_crf_table_from_file(
     # the filling leads to long and a bit confusing headers, but as long
     # as pandas can not fill values of merged cells in all individual cells
     # we have to use some filling algorithm.
-    df_header = df_raw.iloc[0:len(table_properties["header"])-1].copy(deep=True)
+    df_header = df_raw.iloc[0 : len(table_properties["header"]) - 1].copy(deep=True)
     df_header.loc[-1] = df_header.columns.values
     df_header.index = df_header.index + 1
     # replace "Unnamed: X" colum names by nan to fill from left in next step
@@ -447,15 +486,17 @@ def read_crf_table_from_file(
                         entities[col] = f"{entities[col]} {value}"
 
     if units is None:
-        raise ValueError(f"Specification for table {table} does not contain unit information.")
+        raise ValueError(
+            f"Specification for table {table} does not contain unit information."
+        )
 
     # remove double spaces
     entities = [entity.strip() for entity in entities]
-    entities = [re.sub('\\s+', ' ', entity) for entity in entities]
+    entities = [re.sub("\\s+", " ", entity) for entity in entities]
 
     # replace the old header
     if len(header) > 2:
-        df_current = df_raw.drop(index=df_raw.iloc[0:len(header)-2].index)
+        df_current = df_raw.drop(index=df_raw.iloc[0 : len(header) - 2].index)
     else:
         df_current = df_raw
 
@@ -469,11 +510,11 @@ def read_crf_table_from_file(
     # remove double spaces
     for col in cols_for_space_stripping:
         df_current[col] = df_current[col].str.strip()
-        df_current[col] = df_current[col].replace('\\s+', ' ', regex=True)
+        df_current[col] = df_current[col].replace("\\s+", " ", regex=True)
 
     # prepare for sector mapping by initializing result lists and
     # variables
-    new_cats = [[''] * len(table_properties["categories"])] * len(df_current)
+    new_cats = [[""] * len(table_properties["categories"])] * len(df_current)
 
     # copy the header rows which are not part of the index (unit)
     new_cats[0] = [df_current.iloc[0][cat_col]] * len(table_properties["categories"])
@@ -485,7 +526,9 @@ def read_crf_table_from_file(
     if non_unique_cats:
         # need to initialize the tree parsing.
         last_parent = category_tree.get_node("root")
-        all_nodes = set([category_tree.get_node(node).tag for node in category_tree.nodes])
+        all_nodes = set(
+            [category_tree.get_node(node).tag for node in category_tree.nodes]
+        )
 
         for idx in range(1, len(df_current)):
             current_cat = df_current.iloc[idx][cat_col]
@@ -497,8 +540,12 @@ def read_crf_table_from_file(
                 break
 
             # check if current category is a child of the last node
-            children = dict([[child.tag, child.identifier]
-                        for child in category_tree.children(last_parent.identifier)])
+            children = dict(
+                [
+                    [child.tag, child.identifier]
+                    for child in category_tree.children(last_parent.identifier)
+                ]
+            )
             if current_cat in children.keys():
                 # the current category is a child of the current parent
                 # do the mapping
@@ -517,21 +564,39 @@ def read_crf_table_from_file(
                 if current_cat in all_nodes:
                     old_parent = last_parent
 
-                    while (current_cat not in children.keys()) and \
-                            (last_parent.identifier != "root"):
+                    while (current_cat not in children.keys()) and (
+                        last_parent.identifier != "root"
+                    ):
                         last_parent = category_tree.get_node(
-                            last_parent.predecessor(category_tree.identifier))
-                        children = dict([[child.tag, child.identifier]
-                                    for child in category_tree.children(last_parent.identifier)])
-
-                    if (last_parent.identifier == "root") and \
-                        (current_cat not in children.keys()):
+                            last_parent.predecessor(category_tree.identifier)
+                        )
+                        children = dict(
+                            [
+                                [child.tag, child.identifier]
+                                for child in category_tree.children(
+                                    last_parent.identifier
+                                )
+                            ]
+                        )
+
+                    if (last_parent.identifier == "root") and (
+                        current_cat not in children.keys()
+                    ):
                         # we have not found the category as direct child of any of the
                         # predecessors. Thus it is missing in the specification in
                         # that place
-                        print(f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, "
-                              f"{file_info['data_year']} (last parent: {old_parent.tag}).")
-                        unknown_categories.append([table, file_info["party"], current_cat, file_info['data_year']])
+                        print(
+                            f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, "
+                            f"{file_info['data_year']} (last parent: {old_parent.tag})."
+                        )
+                        unknown_categories.append(
+                            [
+                                table,
+                                file_info["party"],
+                                current_cat,
+                                file_info["data_year"],
+                            ]
+                        )
                         # copy back the parent info to continue with next category
                         last_parent = old_parent
                     else:
@@ -543,8 +608,12 @@ def read_crf_table_from_file(
                         if new_children:
                             last_parent = node
                 else:
-                    print(f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, {file_info['data_year']}.")
-                    unknown_categories.append([table, file_info["party"], current_cat, file_info['data_year']])
+                    print(
+                        f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, {file_info['data_year']}."
+                    )
+                    unknown_categories.append(
+                        [table, file_info["party"], current_cat, file_info["data_year"]]
+                    )
     else:
         for idx in range(1, len(df_current)):
             current_cat = df_current.iloc[idx][cat_col]
@@ -557,30 +626,45 @@ def read_crf_table_from_file(
             if current_cat in all_cats:
                 new_cats[idx] = unique_mapping[current_cat]
                 if (idx == len(df_current) - 1) and not last_row_nan:
-                    print(f"found information in last row: category {current_cat}, row {idx}")
-                    info_last_row.append([table, file_info["party"], current_cat, file_info['data_year']])
+                    print(
+                        f"found information in last row: category {current_cat}, row {idx}"
+                    )
+                    info_last_row.append(
+                        [table, file_info["party"], current_cat, file_info["data_year"]]
+                    )
             else:
-                print(f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, {file_info['data_year']}.")
-                unknown_categories.append([table, file_info["party"], current_cat, file_info['data_year']])
+                print(
+                    f"Unknown category '{current_cat}' found in {table} for {file_info['party']}, {file_info['data_year']}."
+                )
+                unknown_categories.append(
+                    [table, file_info["party"], current_cat, file_info["data_year"]]
+                )
 
     for idx, col in enumerate(table_properties["categories"]):
-        df_current.insert(loc=idx, column=col, value=
-                          [cat[idx] for cat in new_cats])
+        df_current.insert(loc=idx, column=col, value=[cat[idx] for cat in new_cats])
 
     # set index
     df_current = df_current.set_index(index_cols)
     # process the unit information using the primap2 functions
-    df_current = pm2.pm2io.nir_add_unit_information(df_current, **table_properties["unit_info"])
+    df_current = pm2.pm2io.nir_add_unit_information(
+        df_current, **table_properties["unit_info"]
+    )
 
     # convert to long format
-    header_long = table_properties["categories"] + \
-        ["orig_cat_name", "entity", "unit", "time", "data"]
+    header_long = table_properties["categories"] + [
+        "orig_cat_name",
+        "entity",
+        "unit",
+        "time",
+        "data",
+    ]
     df_long = pm2.pm2io.nir_convert_df_to_long(
-        df_current, file_info["data_year"], header_long=header_long)
+        df_current, file_info["data_year"], header_long=header_long
+    )
 
     # add country information
     df_long.insert(0, column="country", value=file_info["party"])
-    #df_long.insert(1, column="submission", value=f"CRF{file_info['submission_year']}")
+    # df_long.insert(1, column="submission", value=f"CRF{file_info['submission_year']}")
     if "coords_defaults" in table_spec.keys():
         for col in table_spec["coords_defaults"]:
             df_long.insert(2, column=col, value=table_spec["coords_defaults"][col])
@@ -589,18 +673,17 @@ def read_crf_table_from_file(
 
 
 def get_crf_files(
-        country_codes: Union[str, list[str]],
-        submission_year: int,
-        data_year: Optional[Union[int, list[int]]] = None,
-        date: Optional[str] = None,
-        folder: Optional[str] = None,
+    country_codes: Union[str, list[str]],
+    submission_year: int,
+    data_year: Optional[Union[int, list[int]]] = None,
+    date: Optional[str] = None,
+    folder: Optional[str] = None,
 ) -> list[Path]:
     """
     Finds all files according to given parameters
 
     Parameters
     ----------
-
     country_codes: str or list[str]
         ISO 3-letter country code or list of country codes
 
@@ -643,14 +726,20 @@ def get_crf_files(
                 new_country_folders = folder_mapping[country_code]
                 if isinstance(new_country_folders, str):
                     # only one folder
-                    country_folders = [*country_folders, data_folder / new_country_folders / submission_folder]
+                    country_folders = [
+                        *country_folders,
+                        data_folder / new_country_folders / submission_folder,
+                    ]
                 else:
-                    country_folders = country_folders + \
-                                      [data_folder / folder / submission_folder
-                                       for folder in new_country_folders]
+                    country_folders = country_folders + [
+                        data_folder / folder / submission_folder
+                        for folder in new_country_folders
+                    ]
             else:
-                raise ValueError(f"No data folder found for country {country_code}. "
-                                 f"Check if folder mapping is up to date.")
+                raise ValueError(
+                    f"No data folder found for country {country_code}. "
+                    f"Check if folder mapping is up to date."
+                )
     else:
         country_folders = [folder]
 
@@ -671,17 +760,17 @@ def get_crf_files(
                     file_filter["party"] = country
                     dates = get_submission_dates(folder, file_filter)
                     file_filter["date"] = find_latest_date(dates)
-                    input_files = input_files + \
-                                  filter_filenames(input_folder.glob("*.xlsx"),
-                                                   **file_filter)
+                    input_files = input_files + filter_filenames(
+                        input_folder.glob("*.xlsx"), **file_filter
+                    )
             else:
                 file_filter = file_filter_template.copy()
                 if date is not None:
                     file_filter["date"] = date
-                input_files = input_files + \
-                              filter_filenames(input_folder.glob("*.xlsx"),
-                                               **file_filter)
-        #else:
+                input_files = input_files + filter_filenames(
+                    input_folder.glob("*.xlsx"), **file_filter
+                )
+        # else:
         #    raise ValueError(f"Folder {input_folder} does not exist")
     if len(input_files) == 0:
         raise ValueError(f"No input files found in {country_folders}")
@@ -699,7 +788,7 @@ def get_crf_files(
 
 
 def get_info_from_crf_filename(
-        filename: str,
+    filename: str,
 ) -> dict[str, Union[int, str]]:
     """
     Parse given file name and return a dict with information
@@ -707,7 +796,6 @@ def get_info_from_crf_filename(
 
     Parameters
     ----------
-
     filename: str
         The file to analyze (without path)
 
@@ -729,8 +817,7 @@ def get_info_from_crf_filename(
     try:
         file_info["data_year"] = int(name_parts[2])
     except:
-        print(f"Data year string {name_parts[2]} "
-              "could not be converted to int.")
+        print(f"Data year string {name_parts[2]} " "could not be converted to int.")
         file_info["data_year"] = name_parts[2]
     file_info["date"] = name_parts[3]
     # the last part (time code) is missing for Australia since 2023
@@ -742,11 +829,11 @@ def get_info_from_crf_filename(
 
 
 def filter_filenames(
-        files_to_filter: list[Path],
-        party: Optional[Union[str, list[str]]] = None,
-        data_year: Optional[Union[int, list[int]]] = None,
-        submission_year: Optional[str] = None,
-        date: Optional[str] = None,
+    files_to_filter: list[Path],
+    party: Optional[Union[str, list[str]]] = None,
+    data_year: Optional[Union[int, list[int]]] = None,
+    submission_year: Optional[str] = None,
+    date: Optional[str] = None,
 ) -> list[Path]:
     """Filter a list of filenames of CRF files
 
@@ -792,8 +879,8 @@ def filter_filenames(
 
 
 def check_crf_file_info(
-        file_info: dict,
-        file_filter: dict,
+    file_info: dict,
+    file_filter: dict,
 ) -> bool:
     """
     Check if a CRF file has given properties
@@ -837,9 +924,9 @@ def check_crf_file_info(
 
 
 def create_category_tree(
-        specification: list[list],
-        table: str,
-        country: Optional[str] = None,
+    specification: list[list],
+    table: str,
+    country: Optional[str] = None,
 ) -> Tree:
     """
     Create a treelib Tree for the categorical hierarchy from a CRF
@@ -850,7 +937,6 @@ def create_category_tree(
 
     Parameters
     ----------
-
     specification: List[List]
         The `sector_mapping` dict of a table specification
 
@@ -866,8 +952,10 @@ def create_category_tree(
     """
     # small sanity check on the specification
     if len(specification[0]) < 3:
-        raise ValueError(f"Error: Specification for Table {table} has non-unique "
-                         "categories and need level specifications")
+        raise ValueError(
+            f"Error: Specification for Table {table} has non-unique "
+            "categories and need level specifications"
+        )
 
     # initialize variables for tree building
     parent_info = [
@@ -888,11 +976,11 @@ def create_category_tree(
     if country is not None:
         # remove country tags from categories and mark categories
         # for other countries for removal
-        specification = [filter_category(mapping, country)
-                         for mapping in specification]
+        specification = [filter_category(mapping, country) for mapping in specification]
         # remove the categories for other countries
-        specification = [mapping for mapping in specification
-                         if mapping[0] != "\\REMOVE"]
+        specification = [
+            mapping for mapping in specification if mapping[0] != "\\REMOVE"
+        ]
 
     # build a tree from specification
     # when looping over the categories present in the table
@@ -903,7 +991,9 @@ def create_category_tree(
         if current_cat_level == last_cat_info["level"]:
             # cat has the same level as preceding one, so no change to
             # parent node
-            category_tree.create_node(current_cat, idx, parent=parent_info[-1]["id"], data=mapping)
+            category_tree.create_node(
+                current_cat, idx, parent=parent_info[-1]["id"], data=mapping
+            )
         elif current_cat_level == last_cat_info["level"] + 1:
             # the current category is one level further away from
             # the trunk of the tree. This means that
@@ -913,23 +1003,29 @@ def create_category_tree(
                 {
                     "id": last_cat_info["id"],
                     "tag": last_cat_info["category"],
-                    "level": last_cat_info["level"]
+                    "level": last_cat_info["level"],
                 }
             )
             # add the category as new node
-            category_tree.create_node(current_cat, idx, parent=parent_info[-1]["id"], data=mapping)
+            category_tree.create_node(
+                current_cat, idx, parent=parent_info[-1]["id"], data=mapping
+            )
 
         elif current_cat_level < last_cat_info["level"]:
             # the new level is smaller (closer to the trunk)
             # than the last one. Thus we remove all parents
             # from this level on
-            parent_info = parent_info[0: current_cat_level + 1]
-            category_tree.create_node(current_cat, idx, parent=parent_info[-1]["id"], data=mapping)
+            parent_info = parent_info[0 : current_cat_level + 1]
+            category_tree.create_node(
+                current_cat, idx, parent=parent_info[-1]["id"], data=mapping
+            )
         else:
             # increase in levels of more than one is not allowed
-            raise ValueError(f"Error in sector hierarchy for table {table}, category {current_cat}: "
-                             f"Category level is {current_cat_level} and parent level is "
-                             f"{parent_info[-1]['level']}")
+            raise ValueError(
+                f"Error in sector hierarchy for table {table}, category {current_cat}: "
+                f"Category level is {current_cat_level} and parent level is "
+                f"{parent_info[-1]['level']}"
+            )
 
         # set last_cat_info
         last_cat_info["category"] = current_cat
@@ -940,8 +1036,8 @@ def create_category_tree(
 
 
 def filter_category(
-        mapping: list,
-        country: str,
+    mapping: list,
+    country: str,
 ) -> list[str]:
     """
     This function checks if a category mapping is suitable for the given country.
@@ -975,9 +1071,9 @@ def filter_category(
             new_mapping[0] = "\\REMOVE"
         else:
             re_result = re.search(regex_exclude_full, mapping[0])
-            new_mapping[0] = mapping[0][len(re_result.group(1)) + 1:]
+            new_mapping[0] = mapping[0][len(re_result.group(1)) + 1 :]
     elif mapping[0].startswith(string_country):
-        new_mapping[0] = mapping[0][len(string_country) + 1:]
+        new_mapping[0] = mapping[0][len(string_country) + 1 :]
     elif re.match(regex_countries, mapping[0]):
         new_mapping[0] = "\\REMOVE"
 
@@ -985,9 +1081,9 @@ def filter_category(
 
 
 def get_latest_date_for_country(
-        country_code: str,
-        submission_year: int,
-)->str:
+    country_code: str,
+    submission_year: int,
+) -> str:
     """
     Find the latest submission date for a country
 
@@ -1013,26 +1109,36 @@ def get_latest_date_for_country(
         country_folders = folder_mapping[country_code]
         if isinstance(country_folders, str):
             # only one folder
-            submission_date = find_latest_date(get_submission_dates(
-                downloaded_data_path_UNFCCC / country_folders / f"CRF{submission_year}", file_filter))
+            submission_date = find_latest_date(
+                get_submission_dates(
+                    downloaded_data_path_UNFCCC
+                    / country_folders
+                    / f"CRF{submission_year}",
+                    file_filter,
+                )
+            )
         else:
             dates = []
             for folder in country_folders:
-                folder_submission = downloaded_data_path_UNFCCC / folder / f"CRF{submission_year}"
+                folder_submission = (
+                    downloaded_data_path_UNFCCC / folder / f"CRF{submission_year}"
+                )
                 if folder_submission.exists():
                     dates = dates + get_submission_dates(folder_submission, file_filter)
             submission_date = find_latest_date(dates)
     else:
-        raise ValueError(f"No data folder found for country {country_code}. "
-                         f"Check if folder mapping is up to date.")
+        raise ValueError(
+            f"No data folder found for country {country_code}. "
+            f"Check if folder mapping is up to date."
+        )
 
     return submission_date
 
 
 def get_submission_dates(
-        folder: Path,
-        file_filter: dict[str, Union[str, int, list]],
-)->list[str]:
+    folder: Path,
+    file_filter: dict[str, Union[str, int, list]],
+) -> list[str]:
     """
     Returns all submission dates available in a folder
 
@@ -1050,8 +1156,10 @@ def get_submission_dates(
             List of dates as str
     """
     if "date" in file_filter:
-        raise ValueError("'date' present in 'file_filter'. This makes no sense as "
-                         "the function's purpose is to return available dates.")
+        raise ValueError(
+            "'date' present in 'file_filter'. This makes no sense as "
+            "the function's purpose is to return available dates."
+        )
 
     if folder.exists():
         files = filter_filenames(folder.glob("*.xlsx"), **file_filter)
@@ -1065,9 +1173,9 @@ def get_submission_dates(
 
 
 def get_submission_parties(
-        folder: Path,
-        file_filter: dict[str, Union[str, int, list]],
-)->list[str]:
+    folder: Path,
+    file_filter: dict[str, Union[str, int, list]],
+) -> list[str]:
     """
     Returns all submission parties available in a folder
 
@@ -1085,8 +1193,10 @@ def get_submission_parties(
             List of parties as str
     """
     if "party" in file_filter:
-        raise ValueError("'party' present in 'file_filter'. This makes no sense as "
-                         "the function's purpose is to return available parties.")
+        raise ValueError(
+            "'party' present in 'file_filter'. This makes no sense as "
+            "the function's purpose is to return available parties."
+        )
 
     if folder.exists():
         files = filter_filenames(list(folder.glob("*.xlsx")), **file_filter)
@@ -1100,9 +1210,9 @@ def get_submission_parties(
 
 
 def find_latest_date(
-        dates: list[str],
-        date_format: str='%d%m%Y',
-)-> str:
+    dates: list[str],
+    date_format: str = "%d%m%Y",
+) -> str:
     """
     Returns the latest date in a list of dates as str in the format
     ddmmyyyy
@@ -1117,11 +1227,11 @@ def find_latest_date(
         str: latest date
     """
     if len(dates) > 0:
-        dates_datetime = [[date, datetime.strptime(date, date_format)] for date in
-                          dates]
+        dates_datetime = [
+            [date, datetime.strptime(date, date_format)] for date in dates
+        ]
         dates_datetime = sorted(dates_datetime, key=itemgetter(1))
     else:
         raise ValueError("Passed list of dates is empty")
 
     return dates_datetime[-1][0]
-

+ 8 - 7
src/unfccc_ghg_data/unfccc_di_reader/read_unfccc_di_for_country.py

@@ -6,12 +6,13 @@ function such that it can be called from datalad
 import argparse
 
 from unfccc_ghg_data.unfccc_di_reader.unfccc_di_reader_core import (
-    read_UNFCCC_DI_for_country)
+    read_UNFCCC_DI_for_country,
+)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--country', help='Country code')
-    parser.add_argument('--date', help='String with current date')
+    parser.add_argument("--country", help="Country code")
+    parser.add_argument("--date", help="String with current date")
     args = parser.parse_args()
 
     country_code = args.country
@@ -19,10 +20,10 @@ if __name__ == "__main__":
 
     read_UNFCCC_DI_for_country(
         country_code=country_code,
-        category_groups=None, # read all categories
-        read_subsectors=False, # not applicable as we read all categories
+        category_groups=None,  # read all categories
+        read_subsectors=False,  # not applicable as we read all categories
         date_str=date_str,
-        pm2if_specifications=None, # automatically use the right specs for AI and NAI
-        default_gwp=None, # automatically uses right default GWP for AI and NAI
+        pm2if_specifications=None,  # automatically use the right specs for AI and NAI
+        default_gwp=None,  # automatically uses right default GWP for AI and NAI
         debug=False,
     )

+ 5 - 8
src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_bur.py

@@ -5,7 +5,7 @@ Based on `process_bur` from national-inventory-submissions
 (https://github.com/openclimatedata/national-inventory-submisions)
 """
 
-#import requests
+# import requests
 import re
 import time
 from pathlib import Path
@@ -15,25 +15,24 @@ import pandas as pd
 from bs4 import BeautifulSoup
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
-from unfccc_ghg_data.unfccc_downloader import get_unfccc_submission_info
 
 from unfccc_ghg_data.helper import downloaded_data_path_UNFCCC
-
+from unfccc_ghg_data.unfccc_downloader import get_unfccc_submission_info
 
 if __name__ == "__main__":
     print("Fetching BUR submissions ...")
 
     url = "https://unfccc.int/BURs"
 
-    #print(url)
+    # print(url)
 
     # set options for headless mode
     profile_path = ".firefox"
     options = Options()
-    options.add_argument('-headless')
+    options.add_argument("-headless")
 
     # create profile for headless mode and automatic downloading
-    options.set_preference('profile', profile_path)
+    options.set_preference("profile", profile_path)
 
     # set up selenium driver
     driver = Firefox(options=options)
@@ -64,7 +63,6 @@ if __name__ == "__main__":
             if str(Path(href).parent).endswith("documents"):
                 targets.append({"title": title, "url": href})
 
-
     pattern = re.compile(r"BUR ?\d")
 
     # Go through sub-pages.
@@ -79,7 +77,6 @@ if __name__ == "__main__":
         else:
             no_downloads.append({target["title"], url})
 
-
     if len(no_downloads) > 0:
         print("No downloads for ", no_downloads)
 

+ 3 - 7
src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_nc.py

@@ -19,21 +19,20 @@ from selenium.webdriver.firefox.options import Options
 from unfccc_ghg_data.helper import downloaded_data_path_UNFCCC
 from unfccc_ghg_data.unfccc_downloader import get_unfccc_submission_info
 
-
 if __name__ == "__main__":
     print("Fetching NC submissions ...")
 
     url = "https://unfccc.int/non-annex-I-NCs"
 
-    #print(url)
+    # print(url)
 
     # set options for headless mode
     profile_path = ".firefox"
     options = Options()
-    options.add_argument('-headless')
+    options.add_argument("-headless")
 
     # create profile for headless mode and automatic downloading
-    options.set_preference('profile', profile_path)
+    options.set_preference("profile", profile_path)
 
     # set up selenium driver
     driver = Firefox(options=options)
@@ -64,10 +63,8 @@ if __name__ == "__main__":
             if str(Path(href).parent).endswith("documents"):
                 targets.append({"title": title, "url": href})
 
-
     pattern = re.compile(r"NC ?\d")
 
-
     # Go through sub-pages.
     for target in targets:
         time.sleep(randrange(5, 15))
@@ -80,7 +77,6 @@ if __name__ == "__main__":
         else:
             no_downloads.append({target["title"], url})
 
-
     if len(no_downloads) > 0:
         print("No downloads for ", no_downloads)
 

+ 27 - 4
src/unfccc_ghg_data/unfccc_reader/Argentina/__init__.py

@@ -1,7 +1,30 @@
-"""Argentina (BUR4)
+"""Read Argentina's BURs, NIRs, NCs
 
 Scripts and configurations to read Argentina's submissions to the UNFCCC.
-Currently code for the following submissions is available:
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
 
-* BUR4 (from pdf)
-"""
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'ARG'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=ARG
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 115 - 96
src/unfccc_ghg_data/unfccc_reader/Argentina/read_ARG_BUR4_from_pdf.py

@@ -2,16 +2,22 @@
 Read Argentina's BUR4 from pdf
 
 This script reads data from Argentina's fourth Binnial Update Report (BUR4).
- Data is read from the pdf file using camelot"""
+Data is read from the pdf file using camelot
+"""
 
+import os
 import sys
 
 import camelot
 import primap2 as pm2
 from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
 
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
-from unfccc_ghg_data.helper import gas_baskets, process_data_for_country
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    gas_baskets,
+    process_data_for_country,
+)
 
 # ###
 # configuration
@@ -21,53 +27,49 @@ from unfccc_ghg_data.helper import gas_baskets, process_data_for_country
 #  PRIMAP2 version
 if __name__ == "__main__":
     # folders and files
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Argentina' / \
-                   'BUR4'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Argentina'
+    input_folder = downloaded_data_path / "UNFCCC" / "Argentina" / "BUR4"
+    output_folder = extracted_data_path / "UNFCCC" / "Argentina"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'ARG_BUR4_2022_'
+    output_filename = "ARG_BUR4_2022_"
+
+    pdf_file = "4to_Informe_Bienal_de_la_Rep%C3%BAblica_Argentina.pdf"
 
-    pdf_file = '4to_Informe_Bienal_de_la_Rep%C3%BAblica_Argentina.pdf'
+    # definitions part 1: reading data from pdf and preprocessing for conversion to
+    # PRIMAP2 format
 
-    # definitions part 1: reading data from pdf and preprocessing for conversion to PRIMAP2 format
     # part 1.1 KyotoGHG, CO2, CH4, N2O tables
     #
     pages_to_read = range(232, 244)
     data_start_keyword = "Id#"
     data_end_keyword = "Fuente: Elaboración propia"
-    index_cols = ['Id#', 'Nombre']
-    col_rename = {
-        index_cols[0]: "category",
-        index_cols[1]: "orig_cat_name"
-    }
-    metadata = {
-        "entity": [0, 1],
-        "unit": [0, 2]
-    }
+    index_cols = ["Id#", "Nombre"]
+    col_rename = {index_cols[0]: "category", index_cols[1]: "orig_cat_name"}
+    metadata = {"entity": [0, 1], "unit": [0, 2]}
 
     rows_to_drop = [0]
 
     metadata_mapping = {
-        'unit': {
-            '(GgCO2e)': 'GgCO2e',
-            '(GgCO2)': 'Gg',
-            '(GgN2O)': 'Gg',
-            '(GgCH4)': 'Gg',
-            '(GgGas)': 'Gg',
+        "unit": {
+            "(GgCO2e)": "GgCO2e",
+            "(GgCO2)": "Gg",
+            "(GgN2O)": "Gg",
+            "(GgCH4)": "Gg",
+            "(GgGas)": "Gg",
         }
     }
 
     # part 1.2: fgases table
-    # the f-gases table is in wide format with no sectoral resolution and gases as row header
+    # the f-gases table is in wide format with no sectoral resolution and gases as row
+    # header
     pages_to_read_fgases = range(244, 247)
     data_start_keyword_fgases = "Gas"
-    index_cols_fgases = ['Gas']
+    index_cols_fgases = ["Gas"]
     cols_to_drop_fgases = ["Nombre"]
     metadata_fgases = {
         "unit": [0, 2],
-        "category": '2',
+        "category": "2",
         "orig_cat_name": "PROCESOS INDUSTRIALES Y USO DE PRODUCTOS",
     }
     col_rename_fgases = {
@@ -79,14 +81,14 @@ if __name__ == "__main__":
     cats_remove = ["Information Items", "Memo Items (3)"]
     # manual category codes
     cat_codes_manual = {  # conversion to PRIMAP1 format
-        '1A6': 'MBIO',
-        '1A3di': 'MBKM',
-        '1A3ai': 'MBKA',
-        '1A3di Navegación marítima y fluvial internacional': 'MBKM',
-        'S/N': 'MMULTIOP',
+        "1A6": "MBIO",
+        "1A3di": "MBKM",
+        "1A3ai": "MBKA",
+        "1A3di Navegación marítima y fluvial internacional": "MBKM",
+        "S/N": "MMULTIOP",
     }
 
-    cat_code_regexp = r'(?P<code>^[A-Z0-9]{1,8}).*'
+    cat_code_regexp = r"(?P<code>^[A-Z0-9]{1,8}).*"
 
     time_format = "%Y"
 
@@ -116,32 +118,32 @@ if __name__ == "__main__":
     coords_value_mapping = {
         #    "category": "PRIMAP1",
         "entity": {
-            'HFC-23': 'HFC23',
-            'HFC-32': 'HFC32',
-            'HFC-41': 'HFC41',
-            'HFC-43-10mee': 'HFC4310mee',
-            'HFC-125': 'HFC125',
-            'HFC-134': 'HFC134',
-            'HFC-134a': 'HFC134a',
-            'HFC-152a': 'HFC152a',
-            'HFC-143': 'HFC143',
-            'HFC-143a': 'HFC143a',
-            'HFC-227ea': 'HFC227ea',
-            'HFC-236fa': 'HFC236fa',
-            'HFC-245ca': 'HFC245ca',
-            'HFC-365mfc': 'HFC365mfc',
-            'HFC-245fa': 'HFC245fa',
-            'PFC-143 (CF4)': 'CF4',
-            'PFC-116 (C2F6)': 'C2F6',
-            'PFC-218 (C3F8)': 'C3F8',
-            'PFC-31-10 (C4F10)': 'C4F10',
-            'c-C4F8': 'cC4F8',
-            'PFC-51-144 (C6F14)': 'C6F14',
+            "HFC-23": "HFC23",
+            "HFC-32": "HFC32",
+            "HFC-41": "HFC41",
+            "HFC-43-10mee": "HFC4310mee",
+            "HFC-125": "HFC125",
+            "HFC-134": "HFC134",
+            "HFC-134a": "HFC134a",
+            "HFC-152a": "HFC152a",
+            "HFC-143": "HFC143",
+            "HFC-143a": "HFC143a",
+            "HFC-227ea": "HFC227ea",
+            "HFC-236fa": "HFC236fa",
+            "HFC-245ca": "HFC245ca",
+            "HFC-365mfc": "HFC365mfc",
+            "HFC-245fa": "HFC245fa",
+            "PFC-143 (CF4)": "CF4",
+            "PFC-116 (C2F6)": "C2F6",
+            "PFC-218 (C3F8)": "C3F8",
+            "PFC-31-10 (C4F10)": "C4F10",
+            "c-C4F8": "cC4F8",
+            "PFC-51-144 (C6F14)": "C6F14",
         },
         "unit": "PRIMAP1",
         "orig_cat_name": {
             "1A3di Navegación marítima y fluvial internacional": "Navegación marítima y fluvial internacional",
-        }
+        },
     }
 
     coords_value_filling = {
@@ -172,7 +174,8 @@ if __name__ == "__main__":
         "references": "https://unfccc.int/documents/419772",
         "rights": "XXXX",
         "contact": "mail@johannes-guetschow.de",
-        "title": "Cuarto Informe Bienal de Actualización de la República Argentina a la Convención Marco delas Naciones Unidas Sobre el Cambio Climático",
+        "title": "Cuarto Informe Bienal de Actualización de la República Argentina a "
+        "la Convención Marco delas Naciones Unidas Sobre el Cambio Climático",
         "comment": "Read fom pdf file by Johannes Gütschow",
         "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
     }
@@ -192,8 +195,9 @@ if __name__ == "__main__":
     data_all = None
     for page in pages_to_read:
         # read current page
-        tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page),
-                                  flavor='stream')
+        tables = camelot.read_pdf(
+            str(input_folder / pdf_file), pages=str(page), flavor="stream"
+        )
         df_current = tables[0].df
         rows_to_drop = []
         for index, data in df_current.iterrows():
@@ -212,16 +216,18 @@ if __name__ == "__main__":
         df_current = df_current.drop(rows_to_drop)
         idx_header = df_current.index[df_current[0] == index_cols[0]].tolist()
         df_current = df_current.rename(
-            dict(zip(df_current.columns, list(df_current.loc[idx_header[0]]))), axis=1)
+            dict(zip(df_current.columns, list(df_current.loc[idx_header[0]]))), axis=1
+        )
         df_current = df_current.drop(idx_header)
 
         # for sheet "Aggregate GHGs" fill entity cell
         if page in range(232, 235):
             df_current.iloc[
-                metadata["entity"][0], metadata["entity"][1]] = "KYOTOGHG (SARGWP100)"
+                metadata["entity"][0], metadata["entity"][1]
+            ] = "KYOTOGHG (SARGWP100)"
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # add columns
@@ -242,21 +248,27 @@ if __name__ == "__main__":
         df_current = df_current.drop(df_current.index[0])
 
         # fix number format
-        df_current = df_current.apply(lambda x: x.str.replace('.', '', regex=False), axis=1)
-        df_current = df_current.apply(lambda x: x.str.replace(',', '.', regex=False),
-                                      axis=1)
+        df_current = df_current.apply(
+            lambda x: x.str.replace(".", "", regex=False), axis=1
+        )
+        df_current = df_current.apply(
+            lambda x: x.str.replace(",", ".", regex=False), axis=1
+        )
 
-        df_current.rename(columns=col_rename, inplace=True)
+        df_current = df_current.rename(columns=col_rename)
 
         # reindex
-        df_current.reset_index(inplace=True, drop=True)
+        df_current = df_current.reset_index(drop=True)
 
         df_current["category"] = df_current["category"].replace(cat_codes_manual)
+
         # then the regex replacements
-        def repl(m):
-            return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-        df_current["category"] = df_current["category"].str.replace(cat_code_regexp, repl,
-                                                                    regex=True)
+        def repl(m):  # noqa: D103
+            return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+        df_current["category"] = df_current["category"].str.replace(
+            cat_code_regexp, repl, regex=True
+        )
 
         df_current = df_current.reset_index(drop=True)
 
@@ -274,7 +286,7 @@ if __name__ == "__main__":
             coords_value_filling=coords_value_filling,
             filter_remove=filter_remove,
             filter_keep=filter_keep,
-            meta_data=meta_data
+            meta_data=meta_data,
         )
 
         # convert to PRIMAP2 native format
@@ -289,8 +301,9 @@ if __name__ == "__main__":
     # read fgases
     for page in pages_to_read_fgases:
         # read current page
-        tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page),
-                                  flavor='stream')
+        tables = camelot.read_pdf(
+            str(input_folder / pdf_file), pages=str(page), flavor="stream"
+        )
         df_current = tables[0].df
         rows_to_drop = []
         for index, data in df_current.iterrows():
@@ -309,11 +322,12 @@ if __name__ == "__main__":
         df_current = df_current.drop(rows_to_drop)
         idx_header = df_current.index[df_current[0] == index_cols_fgases[0]].tolist()
         df_current = df_current.rename(
-            dict(zip(df_current.columns, list(df_current.loc[idx_header[0]]))), axis=1)
+            dict(zip(df_current.columns, list(df_current.loc[idx_header[0]]))), axis=1
+        )
         df_current = df_current.drop(idx_header)
 
         # drop all rows where the index cols (category code
-        df_current.dropna(axis=0, how='all', subset=index_cols_fgases, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols_fgases)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # add columns
@@ -324,7 +338,8 @@ if __name__ == "__main__":
                 value = metadata_fgases[col]
             else:
                 value = df_current.iloc[
-                    metadata_fgases[col][0], metadata_fgases[col][1] + inserted]
+                    metadata_fgases[col][0], metadata_fgases[col][1] + inserted
+                ]
                 if col in metadata_mapping.keys():
                     if value in metadata_mapping[col].keys():
                         value = metadata_mapping[col][value]
@@ -339,21 +354,27 @@ if __name__ == "__main__":
         df_current = df_current.drop(df_current.index[0])
 
         # fix number format
-        df_current = df_current.apply(lambda x: x.str.replace('.', '', regex=False), axis=1)
-        df_current = df_current.apply(lambda x: x.str.replace(',', '.', regex=False),
-                                      axis=1)
+        df_current = df_current.apply(
+            lambda x: x.str.replace(".", "", regex=False), axis=1
+        )
+        df_current = df_current.apply(
+            lambda x: x.str.replace(",", ".", regex=False), axis=1
+        )
 
-        df_current.rename(columns=col_rename_fgases, inplace=True)
+        df_current = df_current.rename(columns=col_rename_fgases)
 
         # reindex
-        df_current.reset_index(inplace=True, drop=True)
+        df_current = df_current.reset_index(drop=True)
 
         df_current["category"] = df_current["category"].replace(cat_codes_manual)
-        # then the regex repalcements
-        def repl(m):
-            return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-        df_current["category"] = df_current["category"].str.replace(cat_code_regexp, repl,
-                                                                    regex=True)
+
+        # then the regex replacements
+        def repl(m):  # noqa: D103
+            return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+        df_current["category"] = df_current["category"].str.replace(
+            cat_code_regexp, repl, regex=True
+        )
 
         df_current = df_current.reset_index(drop=True)
 
@@ -371,7 +392,7 @@ if __name__ == "__main__":
             coords_value_filling=coords_value_filling,
             filter_remove=filter_remove,
             filter_keep=filter_keep,
-            meta_data=meta_data
+            meta_data=meta_data,
         )
 
         # convert to PRIMAP2 native format
@@ -390,19 +411,17 @@ if __name__ == "__main__":
         processing_info_country=None,
     )
 
-
     # ###
     # save data to IF and native format
     # ###
 
     encoding = {var: compression for var in data_all.data_vars}
-    data_all.pr.to_netcdf(output_folder / (output_filename + coords_terminologies[
-        "category"] + ".nc"), encoding=encoding)
+    data_all.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )
 
     data_if = data_all.pr.to_interchange_format()
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
-
-
-
-
-
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )

+ 28 - 5
src/unfccc_ghg_data/unfccc_reader/Chile/__init__.py

@@ -1,7 +1,30 @@
-"""Chile (BUR4, BUR5)
+"""Read Chile's BURs, NIRs, NCs
 
-Scripts and configurations to read Chile's is available:
- * BUR4 (from xlsx)
- * BUR5 (from xlsx)
+Scripts and configurations to read Chile's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
 
-"""
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'CHL'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=CHL
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 288 - 141
src/unfccc_ghg_data/unfccc_reader/Chile/config_chl_bur4.py

@@ -1,166 +1,295 @@
+"""Config for Chile BUR4,5
+
+General configuration for reading the inventory files underlying Chile's BURs 4 and 5.
+PRIMAP2 data format specific configuration is BUR specific and not contained here
+but in the reading scripts.
+
+"""
+
 ## parameters for conversion to IPCC2006 categories
 filter_remove_IPCC2006 = {
-    "filter_cats": { # filter cats that have no 1:1 match for IPCC2006 or are additional subsectors
+    "filter_cats": {  # filter cats that have no 1:1 match for IPCC2006 or are additional subsectors
         "category (IPCC2006_PRIMAP)": [
             # refrigeration and air conditioning subsectors don't match IPCC2006
-            '2.F.1.a', '2.F.1.b', '2.F.1.c', '2.F.1.d', '2.F.1.e', '2.F.1.f',
+            "2.F.1.a",
+            "2.F.1.b",
+            "2.F.1.c",
+            "2.F.1.d",
+            "2.F.1.e",
+            "2.F.1.f",
             # additional subsectors for other cattle in enteric fermentation
-            '3.A.1.b.i', '3.A.1.b.ii', '3.A.1.b.iii', '3.A.1.b.iv', '3.A.1.b.v',
+            "3.A.1.b.i",
+            "3.A.1.b.ii",
+            "3.A.1.b.iii",
+            "3.A.1.b.iv",
+            "3.A.1.b.v",
             # additional subcategories for swine in enteric fermentation
-            '3.A.3.a', '3.A.3.b', '3.A.3.c',
+            "3.A.3.a",
+            "3.A.3.b",
+            "3.A.3.c",
             # other animals in enteric fermentation not fitting the IPCC2006 other animals
-            '3.A.4',
+            "3.A.4",
             # need to be summed to '3.A.4.j'
-            '3.A.4.f', '3.A.4.g', '3.A.4.g.i', '3.A.4.g.ii',
+            "3.A.4.f",
+            "3.A.4.g",
+            "3.A.4.g.i",
+            "3.A.4.g.ii",
             # additional subsectors for other cattle in enteric fermentation
-            '3.B.1.b.i', '3.B.1.b.ii', '3.B.1.b.iii', '3.B.1.b.iv', '3.B.1.b.v',
+            "3.B.1.b.i",
+            "3.B.1.b.ii",
+            "3.B.1.b.iii",
+            "3.B.1.b.iv",
+            "3.B.1.b.v",
             # additional subcategories for swine in enteric fermentation
-            '3.B.3.a', '3.B.3.b', '3.B.3.c',
+            "3.B.3.a",
+            "3.B.3.b",
+            "3.B.3.c",
             # other animals in enteric fermentation not fitting the IPCC2006 other animals
-            '3.B.4',
+            "3.B.4",
             # need to be summed to '3.A.4.j'
-            '3.B.4.f', '3.B.4.g', '3.B.4.g.i', '3.B.4.g.ii',
+            "3.B.4.f",
+            "3.B.4.g",
+            "3.B.4.g.i",
+            "3.B.4.g.ii",
             # subsectors of indirect N2O from manure management
-            '3.B.5.a', '3.B.5.b', '3.B.5.c', '3.B.5.d', '3.B.5.d.i', '3.B.5.d.ii',
-            '3.B.5.d.iii', '3.B.5.d.iv', '3.B.5.d.v', '3.B.5.d.vi', '3.B.5.d.vii',
+            "3.B.5.a",
+            "3.B.5.b",
+            "3.B.5.c",
+            "3.B.5.d",
+            "3.B.5.d.i",
+            "3.B.5.d.ii",
+            "3.B.5.d.iii",
+            "3.B.5.d.iv",
+            "3.B.5.d.v",
+            "3.B.5.d.vi",
+            "3.B.5.d.vii",
             # subsectors of rice cultivation
-            '3.C.1', '3.C.2', '3.C.3', '3.C.4',
+            "3.C.1",
+            "3.C.2",
+            "3.C.3",
+            "3.C.4",
             # no direct represenation of "agricultural soils" in IPCC 2006
-            '3.D',
+            "3.D",
             # subsectors of 3.D.1. not matching subsectors of 3.C.4 (direct emissions from managed soils)
             # '3.D.1.a.': '3.C.1.a', '3.D.1.b.': '3.C.1.b', '3.D.1.c.': '3.A.4.c', '3.D.1.d.': '3.C.4.d',
-            '3.D.1.a', '3.D.1.b', '3.D.1.b.i', '3.D.1.b.ii', '3.D.1.b.iii', '3.D.1.c',
-            '3.D.1.d', '3.D.1.e', '3.D.1.f', '3.D.1.g',
+            "3.D.1.a",
+            "3.D.1.b",
+            "3.D.1.b.i",
+            "3.D.1.b.ii",
+            "3.D.1.b.iii",
+            "3.D.1.c",
+            "3.D.1.d",
+            "3.D.1.e",
+            "3.D.1.f",
+            "3.D.1.g",
             # additional subsector level of 3.D.2.a (3.C.5.a Atmospheric deposition)
-            '3.D.2.a.i', '3.D.2.a.ii', '3.D.2.a.ii.1', '3.D.2.a.ii.2', '3.D.2.a.ii.3', '3.D.2.a.iii',
+            "3.D.2.a.i",
+            "3.D.2.a.ii",
+            "3.D.2.a.ii.1",
+            "3.D.2.a.ii.2",
+            "3.D.2.a.ii.3",
+            "3.D.2.a.iii",
             # additional subsector level of 3.D.2.b (3.C.5.b Nitrongen leaching and runoff)
-            '3.D.2.b.i', '3.D.2.b.ii', '3.D.2.b.ii.1', '3.D.2.b.ii.2', '3.D.2.b.ii.3', '3.D.2.b.iii',
-            '3.D.2.b.iv', '3.D.2.b.v',
+            "3.D.2.b.i",
+            "3.D.2.b.ii",
+            "3.D.2.b.ii.1",
+            "3.D.2.b.ii.2",
+            "3.D.2.b.ii.3",
+            "3.D.2.b.iii",
+            "3.D.2.b.iv",
+            "3.D.2.b.v",
             # additional subsector level of 3.F (3.C.1.b Biomass burning in cropland)
-            '3.F.1', '3.F.2', '3.F.3',
+            "3.F.1",
+            "3.F.2",
+            "3.F.3",
             # additional subsector level of 3.G (3.C.2 Liming)
-            '3.G.1', '3.G.2',
+            "3.G.1",
+            "3.G.2",
             # additional subsector levels of 4.A.1 (3.A.1.a Forest land remaining forest land)
-            '4.A.1.a', '4.A.1.a.i', '4.A.1.a.i.1', '4.A.1.a.i.1.a', '4.A.1.a.i.1.b', '4.A.1.a.i.1.c',
-            '4.A.1.a.i.1.d', '4.A.1.a.i.1.e', '4.A.1.a.i.1.f', '4.A.1.a.i.1.g', '4.A.1.a.i.1.h',
-            '4.A.1.a.i.1.i', '4.A.1.a.i.1.j', '4.A.1.a.i.1.k', '4.A.1.a.i.1.l', '4.A.1.a.i.2',
-            '4.A.1.a.i.2.a', '4.A.1.a.i.2.b', '4.A.1.a.i.2.c', '4.A.1.a.i.2.d', '4.A.1.a.i.2.e',
-            '4.A.1.a.i.2.f', '4.A.1.a.i.2.g', '4.A.1.a.i.2.h', '4.A.1.a.i.2.i', '4.A.1.a.i.2.j',
-            '4.A.1.a.i.2.k', '4.A.1.a.i.2.l', '4.A.1.a.i.3', '4.A.1.a.i.3.a', '4.A.1.a.i.3.b',
-            '4.A.1.a.i.3.c', '4.A.1.a.i.3.d', '4.A.1.a.i.3.e', '4.A.1.a.i.3.f', '4.A.1.a.i.3.g',
-            '4.A.1.a.i.3.h', '4.A.1.a.i.3.i', '4.A.1.a.i.3.j', '4.A.1.a.i.3.k', '4.A.1.a.i.3.l',
-            '4.A.1.a.ii', '4.A.1.a.ii.1', '4.A.1.a.ii.2', '4.A.1.a.ii.3', '4.A.1.a.ii.4',
-            '4.A.1.a.ii.5', '4.A.1.a.ii.6', '4.A.1.a.ii.7', '4.A.1.b', '4.A.1.b.i', '4.A.1.b.i.1',
-            '4.A.1.b.i.2', '4.A.1.b.i.3', '4.A.1.b.i.4', '4.A.1.b.ii', '4.A.1.b.ii.1', '4.A.1.b.ii.2',
-            '4.A.1.b.iii', '4.A.1.b.iii.1', '4.A.1.b.iii.1.a', '4.A.1.b.iii.1.b', '4.A.1.b.iii.2',
-            '4.A.1.b.iv', '4.A.1.c', '4.A.1.c.i', '4.A.1.c.ii',
+            "4.A.1.a",
+            "4.A.1.a.i",
+            "4.A.1.a.i.1",
+            "4.A.1.a.i.1.a",
+            "4.A.1.a.i.1.b",
+            "4.A.1.a.i.1.c",
+            "4.A.1.a.i.1.d",
+            "4.A.1.a.i.1.e",
+            "4.A.1.a.i.1.f",
+            "4.A.1.a.i.1.g",
+            "4.A.1.a.i.1.h",
+            "4.A.1.a.i.1.i",
+            "4.A.1.a.i.1.j",
+            "4.A.1.a.i.1.k",
+            "4.A.1.a.i.1.l",
+            "4.A.1.a.i.2",
+            "4.A.1.a.i.2.a",
+            "4.A.1.a.i.2.b",
+            "4.A.1.a.i.2.c",
+            "4.A.1.a.i.2.d",
+            "4.A.1.a.i.2.e",
+            "4.A.1.a.i.2.f",
+            "4.A.1.a.i.2.g",
+            "4.A.1.a.i.2.h",
+            "4.A.1.a.i.2.i",
+            "4.A.1.a.i.2.j",
+            "4.A.1.a.i.2.k",
+            "4.A.1.a.i.2.l",
+            "4.A.1.a.i.3",
+            "4.A.1.a.i.3.a",
+            "4.A.1.a.i.3.b",
+            "4.A.1.a.i.3.c",
+            "4.A.1.a.i.3.d",
+            "4.A.1.a.i.3.e",
+            "4.A.1.a.i.3.f",
+            "4.A.1.a.i.3.g",
+            "4.A.1.a.i.3.h",
+            "4.A.1.a.i.3.i",
+            "4.A.1.a.i.3.j",
+            "4.A.1.a.i.3.k",
+            "4.A.1.a.i.3.l",
+            "4.A.1.a.ii",
+            "4.A.1.a.ii.1",
+            "4.A.1.a.ii.2",
+            "4.A.1.a.ii.3",
+            "4.A.1.a.ii.4",
+            "4.A.1.a.ii.5",
+            "4.A.1.a.ii.6",
+            "4.A.1.a.ii.7",
+            "4.A.1.b",
+            "4.A.1.b.i",
+            "4.A.1.b.i.1",
+            "4.A.1.b.i.2",
+            "4.A.1.b.i.3",
+            "4.A.1.b.i.4",
+            "4.A.1.b.ii",
+            "4.A.1.b.ii.1",
+            "4.A.1.b.ii.2",
+            "4.A.1.b.iii",
+            "4.A.1.b.iii.1",
+            "4.A.1.b.iii.1.a",
+            "4.A.1.b.iii.1.b",
+            "4.A.1.b.iii.2",
+            "4.A.1.b.iv",
+            "4.A.1.c",
+            "4.A.1.c.i",
+            "4.A.1.c.ii",
             # additional subsector level in land converted to forest land
-            '4.A.2.a.i', '4.A.2.a.ii', '4.A.2.b.i', '4.A.2.b.ii', '4.A.2.c.i', '4.A.2.c.ii',
-            '4.A.2.d.i', '4.A.2.d.ii', '4.A.2.e.i', '4.A.2.e.ii',
+            "4.A.2.a.i",
+            "4.A.2.a.ii",
+            "4.A.2.b.i",
+            "4.A.2.b.ii",
+            "4.A.2.c.i",
+            "4.A.2.c.ii",
+            "4.A.2.d.i",
+            "4.A.2.d.ii",
+            "4.A.2.e.i",
+            "4.A.2.e.ii",
             # subsectors of solid waste disposal might not match
-            '5.A.1', '5.A.2', '5.A.3',
+            "5.A.1",
+            "5.A.2",
+            "5.A.3",
         ],
     },
 }
 
 
-cat_mapping = { # categories not listed here have the same UNFCCC_GHG_data as in IPCC 2006 specifications
-    '3': 'M.AG',
-    '3.A': '3.A.1',
-    '3.A.1': '3.A.1.a',
-    '3.A.1.a': '3.A.1.a.i',
-    '3.A.1.b': '3.A.1.a.ii',
-    '3.A.2': '3.A.1.c',
-    '3.A.3': '3.A.1.h',
-    '3.A.4.a': '3.A.1.b',
-    '3.A.4.b': '3.A.1.d',
-    '3.A.4.c': '3.A.1.f',
-    '3.A.4.d': '3.A.1.g',
-    '3.A.4.e': '3.A.1.i',
-    '3.B': '3.A.2',
-    '3.B.1': '3.A.2.a',
-    '3.B.1.a': '3.A.2.a.i',
-    '3.B.1.b': '3.A.2.a.ii',
-    '3.B.2': '3.A.2.c',
-    '3.B.3': '3.A.2.h',
-    '3.B.4.a': '3.A.2.b',
-    '3.B.4.b': '3.A.2.d',
-    '3.B.4.c': '3.A.2.f',
-    '3.B.4.d': '3.A.2.g',
-    '3.B.4.e': '3.A.2.i',
-    '3.B.5': '3.C.6',
-    '3.C': '3.C.7',
-    '3.D.1': '3.C.4',
-    '3.D.2': '3.C.5',
-    '3.D.2.a': '3.C.5.a', # not in climate_categories
-    '3.D.2.b': '3.C.5.b', # not in climate_categories
-    '3.E': '3.C.1.c',
-    '3.F': '3.C.1.b',
-    '3.G': '3.C.2',
-    '3.H': '3.C.3',
-    '3.I': '3.C.8.a', # merge this with cat below
-    '3.J': '3.C.8.b', # merge with cat above
-    '4': 'M.LULUCF',
-    '4.A': '3.B.1',
-    '4.A.1': '3.B.1.a',
-    '4.A.2': '3.B.1.b',
-    '4.A.2.a': '3.B.1.b.i',
-    '4.A.2.b': '3.B.1.b.ii',
-    '4.A.2.c': '3.B.1.b.iii',
-    '4.A.2.d': '3.B.1.b.iv',
-    '4.A.2.e': '3.B.1.b.v',
-    '4.B': '3.B.2',
-    '4.B.1': '3.B.2.a',
-    '4.B.2': '3.B.2.b',
-    '4.B.2.a': '3.B.2.b.i',
-    '4.B.2.b': '3.B.2.b.ii',
-    '4.B.2.c': '3.B.2.b.iii',
-    '4.B.2.d': '3.B.2.b.iv',
-    '4.B.2.e': '3.B.2.b.v',
-    '4.C': '3.B.3',
-    '4.C.1': '3.B.3.a',
-    '4.C.2': '3.B.3.b',
-    '4.C.2.a': '3.B.3.b.i',
-    '4.C.2.b': '3.B.3.b.ii',
-    '4.C.2.c': '3.B.3.b.iii',
-    '4.C.2.d': '3.B.3.b.iv',
-    '4.C.2.e': '3.B.3.b.v',
-    '4.D': '3.B.4',
-    '4.D.1': '3.B.4.a',
-    '4.D.2': '3.B.4.b',
-    '4.D.2.a': '3.B.4.b.i',
-    '4.D.2.b': '3.B.4.b.ii',
-    '4.D.2.c': '3.B.4.b.iii',
-    '4.D.2.d': '3.B.4.b.iv',
-    '4.D.2.e': '3.B.4.b.v',
-    '4.E': '3.B.5',
-    '4.E.1': '3.B.5.a',
-    '4.E.2': '3.B.5.b',
-    '4.E.2.a': '3.B.5.b.i',
-    '4.E.2.b': '3.B.5.b.ii',
-    '4.E.2.c': '3.B.5.b.iii',
-    '4.E.2.d': '3.B.5.b.iv',
-    '4.E.2.e': '3.B.5.b.v',
-    '4.F': '3.B.6',
-    '4.F.1': '3.B.6.a',
-    '4.F.2': '3.B.6.b',
-    '4.F.2.a': '3.B.6.b.i',
-    '4.F.2.b': '3.B.6.b.ii',
-    '4.F.2.c': '3.B.6.b.iii',
-    '4.F.2.d': '3.B.6.b.iv',
-    '4.F.2.e': '3.B.6.b.v',
-    '4.G': '3.D.1',
-    '4.H': '3.D.2',
-    '5': '4',
-    '5.A': '4.A',
-    '5.B': '4.B',
-    '5.C': '4.C',
-    '5.C.1': '4.C.1',
-    '5.C.2': '4.C.2',
-    '5.D': '4.D',
-    '5.D.1': '4.D.1',
-    '5.D.2': '4.D.2',
-    '5.E': '4.E',
+cat_mapping = {  # categories not listed here have the same UNFCCC_GHG_data as in IPCC 2006 specifications
+    "3": "M.AG",
+    "3.A": "3.A.1",
+    "3.A.1": "3.A.1.a",
+    "3.A.1.a": "3.A.1.a.i",
+    "3.A.1.b": "3.A.1.a.ii",
+    "3.A.2": "3.A.1.c",
+    "3.A.3": "3.A.1.h",
+    "3.A.4.a": "3.A.1.b",
+    "3.A.4.b": "3.A.1.d",
+    "3.A.4.c": "3.A.1.f",
+    "3.A.4.d": "3.A.1.g",
+    "3.A.4.e": "3.A.1.i",
+    "3.B": "3.A.2",
+    "3.B.1": "3.A.2.a",
+    "3.B.1.a": "3.A.2.a.i",
+    "3.B.1.b": "3.A.2.a.ii",
+    "3.B.2": "3.A.2.c",
+    "3.B.3": "3.A.2.h",
+    "3.B.4.a": "3.A.2.b",
+    "3.B.4.b": "3.A.2.d",
+    "3.B.4.c": "3.A.2.f",
+    "3.B.4.d": "3.A.2.g",
+    "3.B.4.e": "3.A.2.i",
+    "3.B.5": "3.C.6",
+    "3.C": "3.C.7",
+    "3.D.1": "3.C.4",
+    "3.D.2": "3.C.5",
+    "3.D.2.a": "3.C.5.a",  # not in climate_categories
+    "3.D.2.b": "3.C.5.b",  # not in climate_categories
+    "3.E": "3.C.1.c",
+    "3.F": "3.C.1.b",
+    "3.G": "3.C.2",
+    "3.H": "3.C.3",
+    "3.I": "3.C.8.a",  # merge this with cat below
+    "3.J": "3.C.8.b",  # merge with cat above
+    "4": "M.LULUCF",
+    "4.A": "3.B.1",
+    "4.A.1": "3.B.1.a",
+    "4.A.2": "3.B.1.b",
+    "4.A.2.a": "3.B.1.b.i",
+    "4.A.2.b": "3.B.1.b.ii",
+    "4.A.2.c": "3.B.1.b.iii",
+    "4.A.2.d": "3.B.1.b.iv",
+    "4.A.2.e": "3.B.1.b.v",
+    "4.B": "3.B.2",
+    "4.B.1": "3.B.2.a",
+    "4.B.2": "3.B.2.b",
+    "4.B.2.a": "3.B.2.b.i",
+    "4.B.2.b": "3.B.2.b.ii",
+    "4.B.2.c": "3.B.2.b.iii",
+    "4.B.2.d": "3.B.2.b.iv",
+    "4.B.2.e": "3.B.2.b.v",
+    "4.C": "3.B.3",
+    "4.C.1": "3.B.3.a",
+    "4.C.2": "3.B.3.b",
+    "4.C.2.a": "3.B.3.b.i",
+    "4.C.2.b": "3.B.3.b.ii",
+    "4.C.2.c": "3.B.3.b.iii",
+    "4.C.2.d": "3.B.3.b.iv",
+    "4.C.2.e": "3.B.3.b.v",
+    "4.D": "3.B.4",
+    "4.D.1": "3.B.4.a",
+    "4.D.2": "3.B.4.b",
+    "4.D.2.a": "3.B.4.b.i",
+    "4.D.2.b": "3.B.4.b.ii",
+    "4.D.2.c": "3.B.4.b.iii",
+    "4.D.2.d": "3.B.4.b.iv",
+    "4.D.2.e": "3.B.4.b.v",
+    "4.E": "3.B.5",
+    "4.E.1": "3.B.5.a",
+    "4.E.2": "3.B.5.b",
+    "4.E.2.a": "3.B.5.b.i",
+    "4.E.2.b": "3.B.5.b.ii",
+    "4.E.2.c": "3.B.5.b.iii",
+    "4.E.2.d": "3.B.5.b.iv",
+    "4.E.2.e": "3.B.5.b.v",
+    "4.F": "3.B.6",
+    "4.F.1": "3.B.6.a",
+    "4.F.2": "3.B.6.b",
+    "4.F.2.a": "3.B.6.b.i",
+    "4.F.2.b": "3.B.6.b.ii",
+    "4.F.2.c": "3.B.6.b.iii",
+    "4.F.2.d": "3.B.6.b.iv",
+    "4.F.2.e": "3.B.6.b.v",
+    "4.G": "3.D.1",
+    "4.H": "3.D.2",
+    "5": "4",
+    "5.A": "4.A",
+    "5.B": "4.B",
+    "5.C": "4.C",
+    "5.C.1": "4.C.1",
+    "5.C.2": "4.C.2",
+    "5.D": "4.D",
+    "5.D.1": "4.D.1",
+    "5.D.2": "4.D.2",
+    "5.E": "4.E",
 }
 
 # comments
@@ -176,11 +305,29 @@ cat_mapping = { # categories not listed here have the same UNFCCC_GHG_data as in
 # '3.A.4.g.ii.',
 
 aggregate_cats = {
-    '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-    '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.3', '3.B.4', '3.B.5', '3.B.6'], 'name': 'Land'},
-    '3.C.1': {'sources': ['3.C.1.b','3.C.1.c'], 'name': 'Emissions from Biomass Burning'},
-    '3.C.8': {'sources': ['3.C.8.a', '3.C.8.b'], 'name': 'Other'},
-    '3.C': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7', '3.C.8'], 'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-    '3.D': {'sources': ['3.D.1', '3.D.2'], 'name': 'Other'},
-    '3': {'sources': ['3.A', '3.B', '3.C', '3.D'], 'name': 'AFOLU'},
+    "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+    "3.B": {
+        "sources": ["3.B.1", "3.B.2", "3.B.3", "3.B.4", "3.B.5", "3.B.6"],
+        "name": "Land",
+    },
+    "3.C.1": {
+        "sources": ["3.C.1.b", "3.C.1.c"],
+        "name": "Emissions from Biomass Burning",
+    },
+    "3.C.8": {"sources": ["3.C.8.a", "3.C.8.b"], "name": "Other"},
+    "3.C": {
+        "sources": [
+            "3.C.1",
+            "3.C.2",
+            "3.C.3",
+            "3.C.4",
+            "3.C.5",
+            "3.C.6",
+            "3.C.7",
+            "3.C.8",
+        ],
+        "name": "Aggregate sources and non-CO2 emissions sources on land",
+    },
+    "3.D": {"sources": ["3.D.1", "3.D.2"], "name": "Other"},
+    "3": {"sources": ["3.A", "3.B", "3.C", "3.D"], "name": "AFOLU"},
 }

+ 90 - 52
src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR4_from_xlsx.py

@@ -2,14 +2,15 @@
 Read Chile's 2020 inventory from xlsx
 
 This script reads data from Chile's 2020 national inventory which is underlying BUR4.
- Data is read from the xlsx file"""
+Data is read from the xlsx file
+"""
 
 import os
 import sys
 
 import pandas as pd
 import primap2 as pm2
-from .config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
+from config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
 from primap2.pm2io._data_reading import filter_data, matches_time_format
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -20,42 +21,49 @@ if __name__ == "__main__":
     # ###
 
     # folders and files
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Chile' / 'BUR4'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Chile'
+    input_folder = downloaded_data_path / "UNFCCC" / "Chile" / "BUR4"
+    output_folder = extracted_data_path / "UNFCCC" / "Chile"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'CHL_BUR4_2021_'
+    output_filename = "CHL_BUR4_2021_"
 
-    inventory_file = 'Inventario_Nacional_de_GEI-1990-2018.xlsx'
+    inventory_file = "Inventario_Nacional_de_GEI-1990-2018.xlsx"
     years_to_read = range(1990, 2018 + 1)
 
     # configuration for conversion to PRIMAP2 data format
     unit_row = "header"
     unit_info = {
-        'regexp_entity': r'(.*)\s\(.*\)$',
-        'regexp_unit': r'.*\s\((.*)\)$',
-        'default_unit': 'kt',
-        'manual_repl_unit': {
-            'kt CO₂ eq': 'ktCO2eq',
-            'HFC (kt CO₂ eq)': 'ktCO2eq',
-            'PFC (kt CO₂ eq)': 'ktCO2eq',
-            'SF₆ (kt CO₂ eq)': 'ktCO2eq',
+        "regexp_entity": r"(.*)\s\(.*\)$",
+        "regexp_unit": r".*\s\((.*)\)$",
+        "default_unit": "kt",
+        "manual_repl_unit": {
+            "kt CO₂ eq": "ktCO2eq",
+            "HFC (kt CO₂ eq)": "ktCO2eq",
+            "PFC (kt CO₂ eq)": "ktCO2eq",
+            "SF₆ (kt CO₂ eq)": "ktCO2eq",
+        },
+        "manual_repl_entity": {
+            "kt CO₂ eq": "KYOTOGHG (AR4GWP100)",
+            "HFC (kt CO₂ eq)": "HFCS (AR4GWP100)",
+            "PFC (kt CO₂ eq)": "PFCS (AR4GWP100)",
+            "SF₆ (kt CO₂ eq)": "SF6 (AR4GWP100)",
         },
-        'manual_repl_entity': {
-            'kt CO₂ eq': 'KYOTOGHG (AR4GWP100)',
-            'HFC (kt CO₂ eq)': 'HFCS (AR4GWP100)',
-            'PFC (kt CO₂ eq)': 'PFCS (AR4GWP100)',
-            'SF₆ (kt CO₂ eq)': 'SF6 (AR4GWP100)',
-        }
     }
-    cols_to_drop = ['Unnamed: 14', 'Unnamed: 16', 'Código IPCC.1',
-                    'Categorías de fuente y sumidero de gases de efecto invernadero.1']
+    cols_to_drop = [
+        "Unnamed: 14",
+        "Unnamed: 16",
+        "Código IPCC.1",
+        "Categorías de fuente y sumidero de gases de efecto invernadero.1",
+    ]
     # columns for category code and original category name
-    index_cols = ['Código IPCC', 'Categorías de fuente y sumidero de gases de efecto invernadero']
+    index_cols = [
+        "Código IPCC",
+        "Categorías de fuente y sumidero de gases de efecto invernadero",
+    ]
 
     # operations on long format DF
-    cols_for_space_stripping = ['category', 'orig_cat_name', 'entity']
+    cols_for_space_stripping = ["category", "orig_cat_name", "entity"]
 
     time_format = "%Y"
 
@@ -85,7 +93,7 @@ if __name__ == "__main__":
         "source": "CHL-GHG-Inventory",
         "provenance": "measured",
         "area": "CHL",
-        "scenario": "BUR4"
+        "scenario": "BUR4",
     }
 
     coords_value_mapping = {
@@ -117,14 +125,14 @@ if __name__ == "__main__":
     }
 
     coords_value_filling = {
-        'category': {  # col to fill
-            'orig_cat_name': {  # col to fill from
-                'Todas las emisiones y las absorciones nacionales': '0',  # from value: to value
-                'Tanque internacional': 'M.BK',
-                'Aviación internacional': 'M.BK.A',
-                'Navegación internacional': 'M.BK.M',
-                'Operaciones multilaterales': 'M.MULTIOP',
-                'Emisiones de CO2 de la biomasa': 'M.BIO',
+        "category": {  # col to fill
+            "orig_cat_name": {  # col to fill from (from value: to value)
+                "Todas las emisiones y las absorciones nacionales": "0",
+                "Tanque internacional": "M.BK",
+                "Aviación internacional": "M.BK.A",
+                "Navegación internacional": "M.BK.M",
+                "Operaciones multilaterales": "M.MULTIOP",
+                "Emisiones de CO2 de la biomasa": "M.BIO",
             }
         }
     }
@@ -141,7 +149,9 @@ if __name__ == "__main__":
     filter_keep = {}
 
     meta_data = {
-        "references": "https://unfccc.int/documents/267936, https://snichile.mma.gob.cl/wp-content/uploads/2021/03/Inventario_Nacional_de_GEI-1990-2018.xlsx",
+        "references": "https://unfccc.int/documents/267936, "
+        "https://snichile.mma.gob.cl/wp-content/uploads/2021/03/"
+        "Inventario_Nacional_de_GEI-1990-2018.xlsx",
         "rights": "",
         "contact": "mail@johannes-guetschow.de.de",
         "title": "Chile: BUR4",
@@ -165,16 +175,24 @@ if __name__ == "__main__":
     for year in years_to_read:
         # read sheet for the year. Each sheet contains several tables,
         # we only read the upper row as the other tables are summary tables
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=str(year), skiprows=2, nrows=442, engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=str(year),
+            skiprows=2,
+            nrows=442,
+            engine="openpyxl",
+        )
         # drop the columns which are empty and repetition of the metadata for the second block
-        df_current.drop(cols_to_drop, axis=1, inplace=True)
+        df_current = df_current.drop(cols_to_drop, axis=1)
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set multi-index. necessary for the stack operation in the conversion to long format
         df_current = df_current.set_index(index_cols)
         # add unit row using information from entity row and add to index
-        df_current = pm2.pm2io.nir_add_unit_information(df_current, unit_row=unit_row, **unit_info)
+        df_current = pm2.pm2io.nir_add_unit_information(
+            df_current, unit_row=unit_row, **unit_info
+        )
         # actual conversion to long format
         df_current = pm2.pm2io.nir_convert_df_to_long(df_current, year)
         # aggregate to one df
@@ -192,7 +210,7 @@ if __name__ == "__main__":
     for col in cols_for_space_stripping:
         df_all[col] = df_all[col].str.strip()
 
-    df_all["category"] = df_all["category"].str.rstrip('.')
+    df_all["category"] = df_all["category"].str.rstrip(".")
 
     data_if = pm2.pm2io.convert_long_dataframe_if(
         df_all,
@@ -204,11 +222,11 @@ if __name__ == "__main__":
         coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
         filter_keep=filter_keep,
-        meta_data=meta_data
+        meta_data=meta_data,
+        time_format="%Y",
     )
 
-
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()
@@ -216,11 +234,16 @@ if __name__ == "__main__":
     # ###
     # save data to IF and native format
     # ###
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )
 
     # ###
     # conversion to ipcc 2006 categories
@@ -236,10 +259,10 @@ if __name__ == "__main__":
         coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
         filter_keep=filter_keep,
-        meta_data=meta_data
+        meta_data=meta_data,
     )
 
-    cat_label = 'category (' + coords_terminologies_2006["category"] + ')'
+    cat_label = "category (" + coords_terminologies_2006["category"] + ")"
     filter_data(data_if_2006, filter_remove=filter_remove_IPCC2006)
     data_if_2006 = data_if_2006.replace({cat_label: cat_mapping})
 
@@ -252,10 +275,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -263,7 +286,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity', 'unit']).sum()
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
 
             df_combine.insert(0, cat_label, cat_to_agg)
             df_combine.insert(1, "orig_cat_name", aggregate_cats[cat_to_agg]["name"])
@@ -274,12 +305,19 @@ if __name__ == "__main__":
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
     # convert back to IF to have units in the fixed format
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
 
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies_2006["category"]), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies_2006["category"]),
+        data_if_2006,
+    )
 
     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        encoding=encoding,
+    )

+ 100 - 55
src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR5_from_xlsx.py

@@ -1,12 +1,16 @@
-# this script reads data from Chile's 2020 national inventory which is underlying BUR4
-# Data is read from the xlsx file
+"""
+Read Chile's 2022 inventory from xlsx
+
+This script reads data from Chile's 2022 national inventory which is underlying BUR5.
+Data is read from the xlsx file
+"""
 
 import os
 import sys
 
 import pandas as pd
 import primap2 as pm2
-from .config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
+from config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
 from primap2.pm2io._data_reading import filter_data, matches_time_format
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -17,43 +21,50 @@ if __name__ == "__main__":
     # ###
 
     # folders and files
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Chile' / 'BUR5'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Chile'
+    input_folder = downloaded_data_path / "UNFCCC" / "Chile" / "BUR5"
+    output_folder = extracted_data_path / "UNFCCC" / "Chile"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'CHL_BUR5_2022_'
+    output_filename = "CHL_BUR5_2022_"
 
-    inventory_file = '2022_GEI_CL.xlsx'
+    inventory_file = "2022_GEI_CL.xlsx"
     years_to_read = range(1990, 2020 + 1)
-    time_format='%Y'
+    time_format = "%Y"
 
     # configuration for conversion to PRIMAP2 data format
     unit_row = "header"
     unit_info = {
-        'regexp_entity': r'(.*)\s\(.*\)$',
-        'regexp_unit': r'.*\s\((.*)\)$',
-        'default_unit': 'kt',
-        'manual_repl_unit': {
-            'kt CO₂ eq': 'ktCO2eq',
-            'HFC (kt CO₂ eq)': 'ktCO2eq',
-            'PFC (kt CO₂ eq)': 'ktCO2eq',
-            'SF₆ (kt CO₂ eq)': 'ktCO2eq',
+        "regexp_entity": r"(.*)\s\(.*\)$",
+        "regexp_unit": r".*\s\((.*)\)$",
+        "default_unit": "kt",
+        "manual_repl_unit": {
+            "kt CO₂ eq": "ktCO2eq",
+            "HFC (kt CO₂ eq)": "ktCO2eq",
+            "PFC (kt CO₂ eq)": "ktCO2eq",
+            "SF₆ (kt CO₂ eq)": "ktCO2eq",
+        },
+        "manual_repl_entity": {
+            "kt CO₂ eq": "KYOTOGHG (AR4GWP100)",
+            "HFC (kt CO₂ eq)": "HFCS (AR4GWP100)",
+            "PFC (kt CO₂ eq)": "PFCS (AR4GWP100)",
+            "SF₆ (kt CO₂ eq)": "SF6 (AR4GWP100)",
         },
-        'manual_repl_entity': {
-            'kt CO₂ eq': 'KYOTOGHG (AR4GWP100)',
-            'HFC (kt CO₂ eq)': 'HFCS (AR4GWP100)',
-            'PFC (kt CO₂ eq)': 'PFCS (AR4GWP100)',
-            'SF₆ (kt CO₂ eq)': 'SF6 (AR4GWP100)',
-        }
     }
-    cols_to_drop = ['Unnamed: 14', 'Unnamed: 16', 'Código IPCC.1',
-                    'Categorías de fuente y sumidero de gases de efecto invernadero.1']
+    cols_to_drop = [
+        "Unnamed: 14",
+        "Unnamed: 16",
+        "Código IPCC.1",
+        "Categorías de fuente y sumidero de gases de efecto invernadero.1",
+    ]
     # columns for category code and original category name
-    index_cols = ['Código IPCC', 'Categorías de fuente y sumidero de gases de efecto invernadero']
+    index_cols = [
+        "Código IPCC",
+        "Categorías de fuente y sumidero de gases de efecto invernadero",
+    ]
 
     # operations on long format DF
-    cols_for_space_stripping = ['category', 'orig_cat_name', 'entity']
+    cols_for_space_stripping = ["category", "orig_cat_name", "entity"]
 
     time_format = "%Y"
 
@@ -83,7 +94,7 @@ if __name__ == "__main__":
         "source": "CHL-GHG-Inventory",
         "provenance": "measured",
         "area": "CHL",
-        "scenario": "BUR5"
+        "scenario": "BUR5",
     }
 
     coords_value_mapping = {
@@ -115,14 +126,14 @@ if __name__ == "__main__":
     }
 
     coords_value_filling = {
-        'category': {  # col to fill
-            'orig_cat_name': {  # col to fill from
-                'Todas las emisiones y las absorciones nacionales': '0',  # from value: to value
-                'Tanque internacional': 'M.BK',
-                'Aviación internacional': 'M.BK.A',
-                'Navegación internacional': 'M.BK.M',
-                'Operaciones multilaterales': 'M.MULTIOP',
-                'Emisiones de CO2 de la biomasa': 'M.BIO',
+        "category": {  # col to fill
+            "orig_cat_name": {  # col to fill from (from value: to value)
+                "Todas las emisiones y las absorciones nacionales": "0",
+                "Tanque internacional": "M.BK",
+                "Aviación internacional": "M.BK.A",
+                "Navegación internacional": "M.BK.M",
+                "Operaciones multilaterales": "M.MULTIOP",
+                "Emisiones de CO2 de la biomasa": "M.BIO",
             }
         }
     }
@@ -132,14 +143,19 @@ if __name__ == "__main__":
             "entity": ["Absorciones CO₂", "Emisiones CO₂"],
         },
         "f2": {
-            "orig_cat_name": ["Partidas informativas", "Todas las emisiones nacionales"],
+            "orig_cat_name": [
+                "Partidas informativas",
+                "Todas las emisiones nacionales",
+            ],
         },
     }
 
     filter_keep = {}
 
     meta_data = {
-        "references": "https://unfccc.int/documents/624735, https://snichile.mma.gob.cl/wp-content/uploads/2023/04/2022_GEI_CL.xlsx",
+        "references": "https://unfccc.int/documents/624735, "
+        "https://snichile.mma.gob.cl/wp-content/uploads/2023/04/"
+        "2022_GEI_CL.xlsx",
         "rights": "",
         "contact": "mail@johannes-guetschow.de.de",
         "title": "Chile: BUR5",
@@ -163,16 +179,24 @@ if __name__ == "__main__":
     for year in years_to_read:
         # read sheet for the year. Each sheet contains several tables,
         # we only read the upper row as the other tables are summary tables
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=str(year), skiprows=2, nrows=442, engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=str(year),
+            skiprows=2,
+            nrows=442,
+            engine="openpyxl",
+        )
         # drop the columns which are empty and repetition of the metadata for the second block
-        df_current.drop(cols_to_drop, axis=1, inplace=True)
+        df_current = df_current.drop(cols_to_drop, axis=1)
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set multi-index. necessary for the stack operation in the conversion to long format
         df_current = df_current.set_index(index_cols)
         # add unit row using information from entity row and add to index
-        df_current = pm2.pm2io.nir_add_unit_information(df_current, unit_row=unit_row, **unit_info)
+        df_current = pm2.pm2io.nir_add_unit_information(
+            df_current, unit_row=unit_row, **unit_info
+        )
         # actual conversion to long format
         df_current = pm2.pm2io.nir_convert_df_to_long(df_current, year)
         # aggregate to one df
@@ -190,7 +214,7 @@ if __name__ == "__main__":
     for col in cols_for_space_stripping:
         df_all[col] = df_all[col].str.strip()
 
-    df_all["category"] = df_all["category"].str.rstrip('.')
+    df_all["category"] = df_all["category"].str.rstrip(".")
 
     data_if = pm2.pm2io.convert_long_dataframe_if(
         df_all,
@@ -206,8 +230,7 @@ if __name__ == "__main__":
         time_format=time_format,
     )
 
-
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()
@@ -215,11 +238,16 @@ if __name__ == "__main__":
     # ###
     # save data to IF and native format
     # ###
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )
 
     # ###
     # conversion to ipcc 2006 categories
@@ -236,10 +264,10 @@ if __name__ == "__main__":
         filter_remove=filter_remove,
         filter_keep=filter_keep,
         meta_data=meta_data,
-        time_format=time_format
+        time_format=time_format,
     )
 
-    cat_label = 'category (' + coords_terminologies_2006["category"] + ')'
+    cat_label = "category (" + coords_terminologies_2006["category"] + ")"
     filter_data(data_if_2006, filter_remove=filter_remove_IPCC2006)
     data_if_2006 = data_if_2006.replace({cat_label: cat_mapping})
 
@@ -252,10 +280,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -263,8 +291,18 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity', 'unit']).sum()
-            df_combine = df_combine.drop(columns=["category (IPCC2006_PRIMAP)", "orig_cat_name"])
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
+            df_combine = df_combine.drop(
+                columns=["category (IPCC2006_PRIMAP)", "orig_cat_name"]
+            )
 
             df_combine.insert(0, cat_label, cat_to_agg)
             df_combine.insert(1, "orig_cat_name", aggregate_cats[cat_to_agg]["name"])
@@ -275,12 +313,19 @@ if __name__ == "__main__":
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
     # convert back to IF to have units in the fixed format
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
 
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies_2006["category"]), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies_2006["category"]),
+        data_if_2006,
+    )
 
     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        encoding=encoding,
+    )

+ 30 - 1
src/unfccc_ghg_data/unfccc_reader/Colombia/__init__.py

@@ -1 +1,30 @@
-"""Code to read Colombia's submissions"""
+"""Read Colombia's BURs, NIRs, NCs
+
+Scripts and configurations to read Colombia's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'COL'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=COL
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 104 - 84
src/unfccc_ghg_data/unfccc_reader/Colombia/read_COL_BUR3_from_xlsx.py

@@ -1,6 +1,10 @@
-# this script reads data from Colombia's BUR3
-# Data is read from the xlsx file which has been exported from the google docs
-# spreadsheet which is linked in the BUR
+"""
+Read Colombia's BUR3 inventory from xlsx
+
+This script reads data from Colombia's BUR3
+Data is read from the xlsx file which has been exported from the google docs
+spreadsheet which is linked in the BUR
+"""
 
 import pandas as pd
 import primap2 as pm2
@@ -14,17 +18,17 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Colombia' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Colombia'
+    input_folder = downloaded_data_path / "UNFCCC" / "Colombia" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Colombia"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'COL_BUR3_2022_'
+    output_filename = "COL_BUR3_2022_"
 
-    inventory_file = 'TR_1990-2018_BUR3-AR5_VF.xlsx'
+    inventory_file = "TR_1990-2018_BUR3-AR5_VF.xlsx"
     years_to_read = range(1990, 2018 + 1)
 
-    sheet_to_read = 'TR 1990-2018'
+    sheet_to_read = "TR 1990-2018"
     cols_to_read = range(0, 47)
 
     compression = dict(zlib=True, complevel=9)
@@ -37,7 +41,6 @@ if __name__ == "__main__":
         "unit": "unit",
     }
 
-
     coords_terminologies = {
         "area": "ISO3",
         "category": "IPCC2006",
@@ -54,53 +57,52 @@ if __name__ == "__main__":
     coords_value_mapping = {
         "unit": "PRIMAP1",
         "entity": {
-            'Absorciones CO2': 'CO2 Absorptions',
-            'Emisiones CO2': 'CO2 Emissions',
-            'Emisiones netas (AR5GWP100)': 'KYOTOGHG (AR5GWP100)',
-            'HFC-23': 'HFC23',
-            'HFC-32': 'HFC32',
+            "Absorciones CO2": "CO2 Absorptions",
+            "Emisiones CO2": "CO2 Emissions",
+            "Emisiones netas (AR5GWP100)": "KYOTOGHG (AR5GWP100)",
+            "HFC-23": "HFC23",
+            "HFC-32": "HFC32",
             #'HFC-41': 'HFC41',
-            'HFC-43-10mee': 'HFC4310mee',
-            'HFC-125': 'HFC125',
+            "HFC-43-10mee": "HFC4310mee",
+            "HFC-125": "HFC125",
             #'HFC-134': 'HFC134',
-            'HFC-134a': 'HFC134a',
-            'HFC-152a': 'HFC152a',
+            "HFC-134a": "HFC134a",
+            "HFC-152a": "HFC152a",
             #'HFC-143': 'HFC143',
-            'HFC-143a': 'HFC143a',
-            'HFC-227ea': 'HFC227ea',
-            'HFC-236fa': 'HFC236fa',
+            "HFC-143a": "HFC143a",
+            "HFC-227ea": "HFC227ea",
+            "HFC-236fa": "HFC236fa",
             #'HFC-245ca': 'HFC245ca',
-            'HFC-245fa': 'HFC245fa',
-            'HFC-365mfc': 'HFC365mfc',
-            'PFC-116': 'C2F6',
-            'PFC-14': 'CF4',
+            "HFC-245fa": "HFC245fa",
+            "HFC-365mfc": "HFC365mfc",
+            "PFC-116": "C2F6",
+            "PFC-14": "CF4",
         },
     }
 
-
     filter_remove = {
         "fGWP": {
             "entity": [
-                'Absorciones CO2 (AR5GWP100)',
-                'Absorciones totales (AR5GWP100)',
-                'CH4 (AR5GWP100)',
-                'Emisiones CO2 (AR5GWP100)',
-                'Total emisiones (AR5GWP100)',
-                'HFC-125 (AR5GWP100)',
-                'HFC-134a (AR5GWP100)',
-                'HFC-143a (AR5GWP100)',
-                'HFC-152a (AR5GWP100)',
-                'HFC-227ea (AR5GWP100)',
-                'HFC-23 (AR5GWP100)',
-                'HFC-236fa (AR5GWP100)',
-                'HFC-245fa (AR5GWP100)',
-                'HFC-32 (AR5GWP100)',
-                'HFC-365mfc (AR5GWP100)',
-                'HFC-43-10mee (AR5GWP100)',
-                'N2O (AR5GWP100)',
-                'PFC-116 (AR5GWP100)',
-                'PFC-14 (AR5GWP100)',
-                'SF6 (AR5GWP100)',
+                "Absorciones CO2 (AR5GWP100)",
+                "Absorciones totales (AR5GWP100)",
+                "CH4 (AR5GWP100)",
+                "Emisiones CO2 (AR5GWP100)",
+                "Total emisiones (AR5GWP100)",
+                "HFC-125 (AR5GWP100)",
+                "HFC-134a (AR5GWP100)",
+                "HFC-143a (AR5GWP100)",
+                "HFC-152a (AR5GWP100)",
+                "HFC-227ea (AR5GWP100)",
+                "HFC-23 (AR5GWP100)",
+                "HFC-236fa (AR5GWP100)",
+                "HFC-245fa (AR5GWP100)",
+                "HFC-32 (AR5GWP100)",
+                "HFC-365mfc (AR5GWP100)",
+                "HFC-43-10mee (AR5GWP100)",
+                "N2O (AR5GWP100)",
+                "PFC-116 (AR5GWP100)",
+                "PFC-14 (AR5GWP100)",
+                "SF6 (AR5GWP100)",
             ],
         },
     }
@@ -116,25 +118,33 @@ if __name__ == "__main__":
         "institution": "UNFCCC",
     }
 
-
     # read the data
-    data_raw = pd.read_excel(input_folder / inventory_file, sheet_name=sheet_to_read,
-                             skiprows=0, nrows=15025, usecols=cols_to_read,
-                             engine="openpyxl", header=None)
+    data_raw = pd.read_excel(
+        input_folder / inventory_file,
+        sheet_name=sheet_to_read,
+        skiprows=0,
+        nrows=15025,
+        usecols=cols_to_read,
+        engine="openpyxl",
+        header=None,
+    )
 
     # fill the units to the right as for merged cells the unit is only in the first cell
-    data_raw.iloc[unit_row] = data_raw.iloc[unit_row].fillna(axis=0, method="ffill")
+    data_raw.iloc[unit_row] = data_raw.iloc[unit_row].ffill(axis=0)
     merge_rows = [1, 2]
     for row in merge_rows:
         data_raw.iloc[row] = data_raw.iloc[row].astype(str).str.replace("nan", "")
     data_raw.iloc[merge_rows[0]] = (
-    data_raw.iloc[merge_rows[0]].astype(str) + " " + data_raw.iloc[
-            merge_rows[1]].astype(str))
+        data_raw.iloc[merge_rows[0]].astype(str)
+        + " "
+        + data_raw.iloc[merge_rows[1]].astype(str)
+    )
     data_raw.iloc[merge_rows[0]] = data_raw.iloc[merge_rows[0]].str.strip()
     data_raw = data_raw.drop(index=data_raw.index[merge_rows[1]])
 
     # merge the category cols
     def join_code_parts(series):
+        """Create a code from the data in the individual columns"""
         code = series.iloc[0]
         for part in series.iloc[1:]:
             if part != "nan":
@@ -143,10 +153,11 @@ if __name__ == "__main__":
             code = "0"
         return code
 
-    cat_columns = [0, 1, 2, 3, 4, 5] # xlsx cols are ["MOD","CAP","CAT","SCAT","NROM",
+    cat_columns = [0, 1, 2, 3, 4, 5]  # xlsx cols are ["MOD","CAP","CAT","SCAT","NROM",
     # "NUM"]
-    data_raw["category"] = data_raw[cat_columns].astype(str).agg(func=join_code_parts,
-                                                                 axis=1)
+    data_raw["category"] = (
+        data_raw[cat_columns].astype(str).agg(func=join_code_parts, axis=1)
+    )
     data_raw = data_raw.drop(columns=cat_columns)
 
     # prepare the dataframe for processig with primap2 functions
@@ -162,27 +173,29 @@ if __name__ == "__main__":
     for year in years:
         data_year = data_raw[data_raw["ANO"] == year]
         data_year = data_year.drop(columns=["ANO", "Categorías de fuente y sumideros"])
-        df_long_new = pm2.pm2io.nir_convert_df_to_long(data_year, year,
-                                                       ["category", "unit", "entity",
-                                                        "time", "data"])
+        df_long_new = pm2.pm2io.nir_convert_df_to_long(
+            data_year, year, ["category", "unit", "entity", "time", "data"]
+        )
         if df_all is None:
             df_all = df_long_new
         else:
-            df_all = pd.concat([df_all, df_long_new], axis=0, join='outer')
+            df_all = pd.concat([df_all, df_long_new], axis=0, join="outer")
 
     df_all["category"] = df_all["category"].str[0]
 
     # map units
-    df_all["unit"] = df_all["unit"].replace({
-        'GEI DIRECTOS - Gg ': 'Gg',
-        'GEI DIRECTOS - Gg CO2 equivalente': 'GgCO2eq',
-    }
+    df_all["unit"] = df_all["unit"].replace(
+        {
+            "GEI DIRECTOS - Gg ": "Gg",
+            "GEI DIRECTOS - Gg CO2 equivalente": "GgCO2eq",
+        }
     )
 
     # add GWP information to entity
     for entity in df_all["entity"].unique():
-        df_all["entity"][(df_all["entity"] == entity) & (
-                    df_all["unit"] == "GgCO2eq")] = f"{entity} (AR5GWP100)"
+        df_all["entity"][
+            (df_all["entity"] == entity) & (df_all["unit"] == "GgCO2eq")
+        ] = f"{entity} (AR5GWP100)"
 
     # reset index before conversion to pm2 IF
     df_all = df_all.reset_index(drop=True)
@@ -196,26 +209,25 @@ if __name__ == "__main__":
     data_if = pm2.pm2io.convert_long_dataframe_if(
         df_all,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
-        convert_str=True
-        )
-
+        convert_str=True,
+        time_format="%Y",
+    )
 
     # combine CO2 emissions and absorptions
-    data_CO2 = data_if[data_if["entity"].isin([
-        'CO2 Absorptions', 'CO2 Emissions'])]
+    data_CO2 = data_if[data_if["entity"].isin(["CO2 Absorptions", "CO2 Emissions"])]
 
-    time_format = '%Y'
+    time_format = "%Y"
     time_columns = [
         col
-        for col in data_CO2.columns.values
+        for col in data_CO2.columns.to_numpy()
         if matches_time_format(col, time_format)
     ]
 
@@ -223,20 +235,23 @@ if __name__ == "__main__":
         data_CO2[col] = pd.to_numeric(data_CO2[col], errors="coerce")
 
     data_CO2 = data_CO2.groupby(
-        by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)',
+        by=[
+            "source",
+            "scenario (PRIMAP)",
+            "provenance",
+            "area (ISO3)",
             f"category ({coords_terminologies['category']})",
-            'unit']).sum(min_count = 1)
+            "unit",
+        ]
+    ).sum(min_count=1)
 
-    data_CO2.insert(0, 'entity', 'CO2')
+    data_CO2.insert(0, "entity", "CO2")
     data_CO2 = data_CO2.reset_index()
 
     data_if = pd.concat([data_if, data_CO2])
 
-
-
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
 
-
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()
 
@@ -245,7 +260,12 @@ if __name__ == "__main__":
     # ###
     if not output_folder.exists():
         output_folder.mkdir()
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Indonesia/__init__.py

@@ -0,0 +1,30 @@
+"""Read Indonesia's BURs, NIRs, NCs
+
+Scripts and configurations to read Indonesia's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'IDN'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=IDN
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 167 - 100
src/unfccc_ghg_data/unfccc_reader/Indonesia/read_IDN_BUR3_from_pdf.py

@@ -1,6 +1,11 @@
-# this script reads data from Indonesia's BUR3
-# Data is read from pdf
-# only the 2019 inventory is read as the BUR refers to BUR2 for earlier years
+"""
+Read Indonesia's BUR3 from pdf
+
+This script reads data from Indonesia's BUR3
+Data are read from pdf using camelot
+only the 2019 inventory is read as the BUR refers to BUR2 for earlier years
+
+"""
 
 import camelot
 import numpy as np
@@ -14,18 +19,19 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Indonesia' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Indonesia'
+    input_folder = downloaded_data_path / "UNFCCC" / "Indonesia" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Indonesia"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'IDN_BUR3_2021_'
+    output_filename = "IDN_BUR3_2021_"
 
-    inventory_file = 'IndonesiaBUR_3_FINAL_REPORT_2.pdf'
+    inventory_file = "IndonesiaBUR_3_FINAL_REPORT_2.pdf"
 
-    gwp_to_use = 'SARGWP100'
+    gwp_to_use = "SARGWP100"
 
-    pages_to_read = range(61,65) # 65 is not read properly but contains almost no data anyway, so add it by hand '61-65'
+    pages_to_read = range(61, 65)  # 65 is not read properly but contains almost no
+    # data anyway, so add it by hand
 
     compression = dict(zlib=True, complevel=9)
 
@@ -36,17 +42,18 @@ if __name__ == "__main__":
     # special header as category code and name in one column
     header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
 
+    time_format = "%Y"
 
     # manual category codes
     cat_codes_manual = {
-        'Total National Emissions and Removals': '0',
-        'Peat Decomposition': 'M.3.B.4.APD',
-        'Peat Fire': 'M.3.B.4.APF',
-        '4A1.2 Industrial Solid Waste Disposal': 'M.4.A.Ind',
+        "Total National Emissions and Removals": "0",
+        "Peat Decomposition": "M.3.B.4.APD",
+        "Peat Fire": "M.3.B.4.APF",
+        "4A1.2 Industrial Solid Waste Disposal": "M.4.A.Ind",
         #'3A2b Direct N2O Emissions from Manure Management': '3.A.2',
     }
 
-    cat_code_regexp = r'(?P<code>^[a-zA-Z0-9]{1,4})\s.*'
+    cat_code_regexp = r"(?P<code>^[a-zA-Z0-9]{1,4})\s.*"
 
     coords_cols = {
         "category": "category",
@@ -75,24 +82,26 @@ if __name__ == "__main__":
         "unit": "PRIMAP1",
         "category": "PRIMAP1",
         "entity": {
-            'Total 3 Gases': f"CO2CH4N2O ({gwp_to_use})",
-            'Net CO2 (1) (2)': 'CO2',
-            'CH4': f"CH4 ({gwp_to_use})",
-            'N2O': f"N2O ({gwp_to_use})",
-            'HFCs': f"HFCS ({gwp_to_use})",
-            'PFCs': f"PFCS ({gwp_to_use})",
-            'SF6': f"SF6 ({gwp_to_use})",
-            'NOx': 'NOX',
-            'CO': 'CO', # no mapping, just added for completeness here
-            'NMVOCs': 'NMVOC',
-            'SO2': 'SO2', # no mapping, just added for completeness here
-            'Other halogenated gases with CO2 equivalent conversion factors (3)': f"OTHERHFCS ({gwp_to_use})",
+            "Total 3 Gases": f"CO2CH4N2O ({gwp_to_use})",
+            "Net CO2 (1) (2)": "CO2",
+            "CH4": f"CH4 ({gwp_to_use})",
+            "N2O": f"N2O ({gwp_to_use})",
+            "HFCs": f"HFCS ({gwp_to_use})",
+            "PFCs": f"PFCS ({gwp_to_use})",
+            "SF6": f"SF6 ({gwp_to_use})",
+            "NOx": "NOX",
+            "CO": "CO",  # no mapping, just added for completeness here
+            "NMVOCs": "NMVOC",
+            "SO2": "SO2",  # no mapping, just added for completeness here
+            "Other halogenated gases with CO2 equivalent conversion factors (3)": f"OTHERHFCS ({gwp_to_use})",
         },
     }
 
-
     filter_remove = {
-        "fHFC": {"entity": 'Other halogenated gases without CO2 equivalent conversion factors (4)'}
+        "fHFC": {
+            "entity": "Other halogenated gases without CO2 equivalent conversion "
+            "factors (4)"
+        }
     }
 
     filter_keep = {}
@@ -107,84 +116,113 @@ if __name__ == "__main__":
     }
 
     # convert to mass units where possible
-    entities_to_convert_to_mass = [
-        'CH4', 'N2O', 'SF6'
-    ]
+    entities_to_convert_to_mass = ["CH4", "N2O", "SF6"]
 
-    # CO2 equivalents don't make sense for these substances, so unit has to be Gg instead of Gg CO2 equivalents as indicated in the table
-    entities_to_fix_unit = [
-        'NOx', 'CO', 'NMVOCs', 'SO2'
-    ]
+    # CO2 equivalents don't make sense for these substances, so unit has to be Gg
+    # instead of Gg CO2 equivalents as indicated in the table
+    entities_to_fix_unit = ["NOx", "CO", "NMVOCs", "SO2"]
 
     # add the data for the last page by hand as it's only one row
     data_last_page = [
-        ['5B Other (please specify)', 'Total 3 Gases', 'GgCO2eq', '2019', 'NE'],
-        ['5B Other (please specify)', 'Net CO2 (1) (2)', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'CH4', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'N2O', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'HFCs', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'PFCs', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'SF6', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'Other halogenated gases with CO2 equivalent conversion factors (3)', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'Other halogenated gases without CO2 equivalent conversion factors (4)', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'NOx', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'CO', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'NMVOCs', 'GgCO2eq', '2019', np.nan],
-        ['5B Other (please specify)', 'SO2', 'GgCO2eq', '2019', np.nan],
+        ["5B Other (please specify)", "Total 3 Gases", "GgCO2eq", "2019", "NE"],
+        ["5B Other (please specify)", "Net CO2 (1) (2)", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "CH4", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "N2O", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "HFCs", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "PFCs", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "SF6", "GgCO2eq", "2019", np.nan],
+        [
+            "5B Other (please specify)",
+            "Other halogenated gases with CO2 equivalent conversion factors (3)",
+            "GgCO2eq",
+            "2019",
+            np.nan,
+        ],
+        [
+            "5B Other (please specify)",
+            "Other halogenated gases without CO2 equivalent conversion factors (4)",
+            "GgCO2eq",
+            "2019",
+            np.nan,
+        ],
+        ["5B Other (please specify)", "NOx", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "CO", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "NMVOCs", "GgCO2eq", "2019", np.nan],
+        ["5B Other (please specify)", "SO2", "GgCO2eq", "2019", np.nan],
     ]
 
     df_last_page = pd.DataFrame(data_last_page, columns=header_long)
 
     aggregate_cats = {
-        '1.A.4': {'sources': ['1.A.4.a', '1.A.4.b'], 'name': 'Other Sectors (calculated)'},
-        '2.A.4': {'sources': ['2.A.4.a', '2.A.4.b', '2.A.4.d'], 'name': 'Other Process uses of Carbonates (calculated)'},
-        '2.B.8': {'sources': ['2.B.8.a', '2.B.8.b', '2.B.8.c', '2.B.8.f'], 'name': 'Petrochemical and Carbon Black production (calculated)'},
-        '4.A': {'sources': ['4.A.2', 'M.4.A.Ind'], 'name': 'Solid Waste Disposal (calculated)'},
+        "1.A.4": {
+            "sources": ["1.A.4.a", "1.A.4.b"],
+            "name": "Other Sectors (calculated)",
+        },
+        "2.A.4": {
+            "sources": ["2.A.4.a", "2.A.4.b", "2.A.4.d"],
+            "name": "Other Process uses of Carbonates (calculated)",
+        },
+        "2.B.8": {
+            "sources": ["2.B.8.a", "2.B.8.b", "2.B.8.c", "2.B.8.f"],
+            "name": "Petrochemical and Carbon Black production (calculated)",
+        },
+        "4.A": {
+            "sources": ["4.A.2", "M.4.A.Ind"],
+            "name": "Solid Waste Disposal (calculated)",
+        },
     }
 
     aggregate_cats_N2O = {
-        '3.A.2': {'sources': ['3.A.2.b'], 'name': '3A2 Manure Management'},
-        '3.A': {'sources': ['3.A.2'], 'name': '3A Livestock'},
+        "3.A.2": {"sources": ["3.A.2.b"], "name": "3A2 Manure Management"},
+        "3.A": {"sources": ["3.A.2"], "name": "3A Livestock"},
     }
 
     aggregate_cats_CO2CH4N2O = {
-        '3.A.2': {'sources': ['3.A.2', '3.A.2.b'], 'name': '3A2 Manure Management'},
+        "3.A.2": {"sources": ["3.A.2", "3.A.2.b"], "name": "3A2 Manure Management"},
     }
 
     df_all = None
 
     for page in pages_to_read:
-        tables = camelot.read_pdf(str(input_folder / inventory_file), pages=str(page),
-                                  flavor='lattice')
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file), pages=str(page), flavor="lattice"
+        )
         df_this_table = tables[0].df
         # replace line breaks, double, and triple spaces in category names
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
         # replace line breaks in units and entities
-        df_this_table.iloc[entity_row] = df_this_table.iloc[entity_row].str.replace('\n',
-                                                                                    '')
-        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].str.replace('\n', '')
+        df_this_table.iloc[entity_row] = df_this_table.iloc[entity_row].str.replace(
+            "\n", ""
+        )
+        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].str.replace(
+            "\n", ""
+        )
 
-        df_this_table = pm2.pm2io.nir_add_unit_information(df_this_table, unit_row=unit_row,
-                                                           entity_row=entity_row,
-                                                           regexp_entity=".*",
-                                                           default_unit="GgCO2eq")  # , **unit_info)
+        df_this_table = pm2.pm2io.nir_add_unit_information(
+            df_this_table,
+            unit_row=unit_row,
+            entity_row=entity_row,
+            regexp_entity=".*",
+            default_unit="GgCO2eq",
+        )
 
         # set index and convert to long format
         df_this_table = df_this_table.set_index(index_cols)
-        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(df_this_table, year,
-                                                              header_long)
+        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(
+            df_this_table, year, header_long
+        )
         df_this_table_long["orig_cat_name"] = df_this_table_long["orig_cat_name"].str[0]
 
         # combine with tables for other sectors (merge not append)
         if df_all is None:
             df_all = df_this_table_long
         else:
-            df_all = pd.concat([df_all, df_this_table_long], axis=0, join='outer')
+            df_all = pd.concat([df_all, df_this_table_long], axis=0, join="outer")
 
     # add the last page manually
-    df_all = pd.concat([df_all, df_last_page], axis=0, join='outer')
+    df_all = pd.concat([df_all, df_last_page], axis=0, join="outer")
 
     # fix the units of aerosols and precursors
     for entity in entities_to_fix_unit:
@@ -196,22 +234,24 @@ if __name__ == "__main__":
     # replace cat names by codes in col "category"
     # first the manual replacements
     df_all["category"] = df_all["category"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-        return m.group('code')
-    df_all["category"] = df_all["category"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_all["category"] = df_all["category"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
     df_all = df_all.reset_index(drop=True)
 
     ###### convert to primap2 IF
 
     # replace "," with "" in data
-    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(',','', regex=False)
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(",", "", regex=False)
 
     # make sure all col headers are str
     df_all.columns = df_all.columns.map(str)
 
-
-
     # ###
     # convert to PRIMAP2 interchange format
     # ###
@@ -222,12 +262,13 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
-        convert_str=True
-        )
+        convert_str=True,
+        time_format=time_format,
+    )
 
     cat_label = "category (IPCC2006)"
 
@@ -244,10 +285,9 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -255,8 +295,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, cat_label, cat_to_agg)
             df_combine.insert(1, "orig_cat_name", aggregate_cats[cat_to_agg]["name"])
@@ -267,10 +314,10 @@ if __name__ == "__main__":
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
-
     # delete cat 3 for N2O as it's wrong
-    index_3A_N2O = data_if[(data_if[cat_label] == '3') &
-                           (data_if['entity'] == 'N2O')].index
+    index_3A_N2O = data_if[
+        (data_if[cat_label] == "3") & (data_if["entity"] == "N2O")
+    ].index
     data_if = data_if.drop(index_3A_N2O)
 
     # aggregate cat 3 for N2O
@@ -283,10 +330,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -294,11 +341,20 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name", aggregate_cats_N2O[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_cats_N2O[cat_to_agg]["name"]
+            )
 
             df_combine = df_combine.reset_index()
 
@@ -307,8 +363,9 @@ if __name__ == "__main__":
             print(f"no data to aggregate category {cat_to_agg}")
 
     # delete cat 3.A.2 for CO2CH4N2O as it's wrong
-    index_3A2_CO2CH4N2O = data_if[(data_if[cat_label] == '3.A.2') &
-                           (data_if['entity'] == 'CH4CO2N2O (SARGWP100)')].index
+    index_3A2_CO2CH4N2O = data_if[
+        (data_if[cat_label] == "3.A.2") & (data_if["entity"] == "CH4CO2N2O (SARGWP100)")
+    ].index
     data_if = data_if.drop(index_3A2_CO2CH4N2O)
 
     # aggregate cat 3 for N2O
@@ -321,10 +378,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -332,11 +389,20 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name", aggregate_cats_CO2CH4N2O[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_cats_CO2CH4N2O[cat_to_agg]["name"]
+            )
 
             df_combine = df_combine.reset_index()
 
@@ -344,7 +410,6 @@ if __name__ == "__main__":
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
-
     data_if.attrs = attrs
 
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
@@ -372,9 +437,11 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"]), data_if)
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
         output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
-        encoding=encoding)
+        encoding=encoding,
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Israel/__init__.py

@@ -0,0 +1,30 @@
+"""Read Israel's BURs, NIRs, NCs
+
+Scripts and configurations to read Israel's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'ISR'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=ISR
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 409 - 314
src/unfccc_ghg_data/unfccc_reader/Israel/config_isr_bur2.py

@@ -1,73 +1,91 @@
+"""Config for Israel's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 #### configuration for trend tables
 import locale
 
-gwp_to_use = 'SARGWP100'
-terminology_proc = 'IPCC2006_PRIMAP'
+gwp_to_use = "SARGWP100"
+terminology_proc = "IPCC2006_PRIMAP"
 # bunkers [0,1] need different specs
 trend_table_def = {
     # only GHG read, rest dropped
-    'GHG': {
-        'tables': [2],
-        'cols_add': {
-            'unit': 'ktCO2eq',
-            'category': '0',
+    "GHG": {
+        "tables": [2],
+        "cols_add": {
+            "unit": "ktCO2eq",
+            "category": "0",
         },
-        'given_col': 'entity',
-        'take_only': ['Total GHG'],
+        "given_col": "entity",
+        "take_only": ["Total GHG"],
     },
-    'CO2': {
-        'tables': [3],
-        'cols_add': {
-            'unit': 'kt',
-            'entity': 'CO2',
+    "CO2": {
+        "tables": [3],
+        "cols_add": {
+            "unit": "kt",
+            "entity": "CO2",
         },
-        'given_col': 'category',
+        "given_col": "category",
     },
-    'CH4': {
-        'tables': [5],
-        'cols_add': {
-            'unit': 'kt',
-            'entity': 'CH4',
+    "CH4": {
+        "tables": [5],
+        "cols_add": {
+            "unit": "kt",
+            "entity": "CH4",
         },
-        'given_col': 'category',
-        'take_only': [
-            'Total emissions', 'From fuel combustion',
-            'From Industrial processes', 'From Agriculture'
-        ], # ignore the waste time series as they don't cover the full sector
+        "given_col": "category",
+        "take_only": [
+            "Total emissions",
+            "From fuel combustion",
+            "From Industrial processes",
+            "From Agriculture",
+        ],  # ignore the waste time series as they don't cover the full sector
         # and lead to problems becaus eof the methodology chnage in the inventory
     },
-    'N2O': {
-        'tables': [6],
-        'cols_add': {
-            'unit': 'kt',
-            'entity': 'N2O',
+    "N2O": {
+        "tables": [6],
+        "cols_add": {
+            "unit": "kt",
+            "entity": "N2O",
         },
-        'given_col': 'category',
+        "given_col": "category",
     },
-    'FGases': {
-        'tables': [7],
-        'cols_add': {
-            'unit': 'ktCO2eq',
-            'category': '0',
+    "FGases": {
+        "tables": [7],
+        "cols_add": {
+            "unit": "ktCO2eq",
+            "category": "0",
         },
-        'given_col': 'entity',
+        "given_col": "entity",
     },
 }
 
 #### configuration for inventory tables
 inv_tab_conf = {
-    'unit_row': 0,
-    'entity_row': 0,
-    'regex_unit': r"\((.*)\)",
-    'regex_entity': r"^(.*)\s\(",
-    'index_cols': 'category',
-    'cat_pos': (0, 0),
-    'header_long': ["category", "entity", "unit", "time", "data"],
-    'header_2010': ["2010", "CO2 emissions (Gg)", "CO2 removals (Gg)",
-                  "CH4 (Gg)", "N2O (Gg)", "CO (Gg)", "NOx (Gg)",
-                  "NMVOCs (Gg)", "SOx (Gg)", "SF6 (CO2eq Gg)",
-                  "HFCs (CO2eq Gg)", "PFCs (CO2eq Gg)"],
-    'unit_repl': {
+    "unit_row": 0,
+    "entity_row": 0,
+    "regex_unit": r"\((.*)\)",
+    "regex_entity": r"^(.*)\s\(",
+    "index_cols": "category",
+    "cat_pos": (0, 0),
+    "header_long": ["category", "entity", "unit", "time", "data"],
+    "header_2010": [
+        "2010",
+        "CO2 emissions (Gg)",
+        "CO2 removals (Gg)",
+        "CH4 (Gg)",
+        "N2O (Gg)",
+        "CO (Gg)",
+        "NOx (Gg)",
+        "NMVOCs (Gg)",
+        "SOx (Gg)",
+        "SF6 (CO2eq Gg)",
+        "HFCs (CO2eq Gg)",
+        "PFCs (CO2eq Gg)",
+    ],
+    "unit_repl": {
         "SF6 (CO2e Gg)": "GgCO2eq",
         "HFCs (CO2eGg)": "GgCO2eq",
         "PFCs (CO2e Gg)": "GgCO2eq",
@@ -78,13 +96,13 @@ inv_tab_conf = {
 }
 
 inv_table_def = {
-    '1996': {'tables': [1, 2]},
-    '2000': {'tables': [3, 4]},
-    '2005': {'tables': [5, 6]},
-    '2010': {'tables': [7, 8]},
-    '2015': {'tables': [9, 10, 11]},
-    '2019': {'tables': [12, 13, 14]},
-    '2020': {'tables': [15, 16]},
+    "1996": {"tables": [1, 2]},
+    "2000": {"tables": [3, 4]},
+    "2005": {"tables": [5, 6]},
+    "2010": {"tables": [7, 8]},
+    "2015": {"tables": [9, 10, 11]},
+    "2019": {"tables": [12, 13, 14]},
+    "2020": {"tables": [15, 16]},
 }
 
 #### configuration for PM2 format
@@ -110,114 +128,114 @@ coords_defaults = {
 coords_value_mapping = {
     "unit": "PRIMAP1",
     "category": {
-        'Total national emissions and removals': '24540',
-        '0': '24540', # no mapping, just for completeness
-        'Total emissions and removals': '24540',
-        'Total emissions': '24540',
-        '1. Energy': '1',
-        'A. Fuel combustion (sectoral approach)': '1.A',
-        'A. From fuel combustion': '1.A',
-        'From fuel combustion': '1.A',
-        '1. Energy industries': '1.A.1',
-        '2. Manufacturing industries and construction': '1.A.2',
-        '2. Manufacturing, industries and construction': '1.A.2',
-        '3. Transport': '1.A.3',
-        '4. Other sectors': '1.A.4',
-        '4. Other': '1.A.4',
-        'Commercial, institutional residential sectors': '1.A.4.ab', # not BURDI
-        'Commercial, institutional': '1.A.4.a', #not BURDI
-        'residential sectors': '1.A.4.b', #not BURDI
-        'Agriculture, forestry and fishing': '1.A.4.c', # not BURDI
-        '5. Other (please specify)': '1.A.5',
-        'B. Fugitive emissions from fuels': '1.B',
-        '1. Solid fuels': '1.B.1',
-        '2. Oil and natural gas': '1.B.2',
-        '2. Industrial processes': '2',
-        'B. industrial processes': '2',
-        'From Industrial processes': '2',
-        'A. Mineral products': '2.A',
-        'CEMENT PRODUCTION': '2.A.1',
-        'PRODUCTION OF LIME': '2.A.2',
-        'SODA ASH USE': '2.A.4.b',
-        'ROAD PAVING WITH ASPHALT': '2.A.6',
-        'Container Glass': '2.A.7.a',
-        'B. Chemical industry': '2.B',
-        'NITRIC ACID PRODUCTION': '2.B.2',
-        'Ethylene': '2.B.5.b',
-        'PRODUCTION OF OTHER CHEMICALS': '2.B.5.g', #not BURDI
-        'Sulphuric Acid': '2.B.5.f', #not BURDI
-        'C. Metal production': '2.C',
-        'D. Other production': '2.D',
-        'E. Production of halocarbons and sulphur hexafluoride': '2.E',
-        'F. Consumption of halocarbons and sulphur hexafluoride': '2.F',
-        'G. Other (IPPU)': '2.G',
-        '3. Solvent and other product use': '3',
-        '4. Agriculture': '4',
-        'From Agriculture': '4',
-        'From agriculture': '4',
-        'A. Enteric fermentation': '4.A',
-        'B. Manure management': '4.B',
-        'C. Rice cultivation': '4.C',
-        'D. Agricultural soils': '4.D',
-        'E. Prescribed burning of savannahs': '4.E',
-        'F. Field burning of agricultural residues': '4.F',
-        'G. Other (Agri)': '4.G',
-        '5. Land-use change and forestry': '5',
-        'C. Land-use change and forestry': '5',
-        'A. Changes in forest and other woody biomass stocks': '5.A',
-        '2. Changes in forest and other woody biomass stocks': '5.A',
-        'B. Forest and grassland conversion': '5.B',
-        'C. Abandonment of managed lands': '5.C',
-        'D. CO2 emissions and removals from soil': '5.D',
-        '1. CO2 emissions and removals from soil': '5.D',
-        'E. Other (LULUCF)': '5.E',
+        "Total national emissions and removals": "24540",
+        "0": "24540",  # no mapping, just for completeness
+        "Total emissions and removals": "24540",
+        "Total emissions": "24540",
+        "1. Energy": "1",
+        "A. Fuel combustion (sectoral approach)": "1.A",
+        "A. From fuel combustion": "1.A",
+        "From fuel combustion": "1.A",
+        "1. Energy industries": "1.A.1",
+        "2. Manufacturing industries and construction": "1.A.2",
+        "2. Manufacturing, industries and construction": "1.A.2",
+        "3. Transport": "1.A.3",
+        "4. Other sectors": "1.A.4",
+        "4. Other": "1.A.4",
+        "Commercial, institutional residential sectors": "1.A.4.ab",  # not BURDI
+        "Commercial, institutional": "1.A.4.a",  # not BURDI
+        "residential sectors": "1.A.4.b",  # not BURDI
+        "Agriculture, forestry and fishing": "1.A.4.c",  # not BURDI
+        "5. Other (please specify)": "1.A.5",
+        "B. Fugitive emissions from fuels": "1.B",
+        "1. Solid fuels": "1.B.1",
+        "2. Oil and natural gas": "1.B.2",
+        "2. Industrial processes": "2",
+        "B. industrial processes": "2",
+        "From Industrial processes": "2",
+        "A. Mineral products": "2.A",
+        "CEMENT PRODUCTION": "2.A.1",
+        "PRODUCTION OF LIME": "2.A.2",
+        "SODA ASH USE": "2.A.4.b",
+        "ROAD PAVING WITH ASPHALT": "2.A.6",
+        "Container Glass": "2.A.7.a",
+        "B. Chemical industry": "2.B",
+        "NITRIC ACID PRODUCTION": "2.B.2",
+        "Ethylene": "2.B.5.b",
+        "PRODUCTION OF OTHER CHEMICALS": "2.B.5.g",  # not BURDI
+        "Sulphuric Acid": "2.B.5.f",  # not BURDI
+        "C. Metal production": "2.C",
+        "D. Other production": "2.D",
+        "E. Production of halocarbons and sulphur hexafluoride": "2.E",
+        "F. Consumption of halocarbons and sulphur hexafluoride": "2.F",
+        "G. Other (IPPU)": "2.G",
+        "3. Solvent and other product use": "3",
+        "4. Agriculture": "4",
+        "From Agriculture": "4",
+        "From agriculture": "4",
+        "A. Enteric fermentation": "4.A",
+        "B. Manure management": "4.B",
+        "C. Rice cultivation": "4.C",
+        "D. Agricultural soils": "4.D",
+        "E. Prescribed burning of savannahs": "4.E",
+        "F. Field burning of agricultural residues": "4.F",
+        "G. Other (Agri)": "4.G",
+        "5. Land-use change and forestry": "5",
+        "C. Land-use change and forestry": "5",
+        "A. Changes in forest and other woody biomass stocks": "5.A",
+        "2. Changes in forest and other woody biomass stocks": "5.A",
+        "B. Forest and grassland conversion": "5.B",
+        "C. Abandonment of managed lands": "5.C",
+        "D. CO2 emissions and removals from soil": "5.D",
+        "1. CO2 emissions and removals from soil": "5.D",
+        "E. Other (LULUCF)": "5.E",
         # waste in 2006 categories, not BURDI as we will lose info of we map to BURDI and back
-        '6. Waste': '6',
-        'A. Solid waste disposal on land': '6.A',
-        'From solid waste disposal on land': '6.A',
-        'B. Waste-water handling': '6X.B', # combine with 6.D
-        'From waste-water treatment': '6X.B', # not BURDI
-        'C. Waste incineration': '6.C',
-        'D. Other (please specify)': '6X.D', # combine with 6.E
-        'B. Biological Treatment of Solid Waste': '6.B', # not BURDI
-        'D.Waste-water handling': '6.D', # not BURDI
-        'D. Waste-water handling': '6.D', # not BURDI
-        'E. Other (Waste)': '6.E', # not BURDI
-        '7. Other (please specify)': '7',
-        'International bunkers': '14637',
-        'Aviation': '14424',
-        'Marine': '14423',
-        'CO2 emissions from biomass': '14638',
+        "6. Waste": "6",
+        "A. Solid waste disposal on land": "6.A",
+        "From solid waste disposal on land": "6.A",
+        "B. Waste-water handling": "6X.B",  # combine with 6.D
+        "From waste-water treatment": "6X.B",  # not BURDI
+        "C. Waste incineration": "6.C",
+        "D. Other (please specify)": "6X.D",  # combine with 6.E
+        "B. Biological Treatment of Solid Waste": "6.B",  # not BURDI
+        "D.Waste-water handling": "6.D",  # not BURDI
+        "D. Waste-water handling": "6.D",  # not BURDI
+        "E. Other (Waste)": "6.E",  # not BURDI
+        "7. Other (please specify)": "7",
+        "International bunkers": "14637",
+        "Aviation": "14424",
+        "Marine": "14423",
+        "CO2 emissions from biomass": "14638",
     },
     "entity": {
-        'Total GHG': f'KYOTOGHG ({gwp_to_use})',
-        'Carbon Dioxide (CO2)': 'CO2',
-        'CO2': 'CO2', # no mapping, just added for completeness here
-        'CO2 emissions': 'CO2 emissions', # no mapping, just added for completeness here
-        'CO2 removals': 'CO2 removals', # no mapping, just added for completeness here
-        'CO2 Emissions': 'CO2 emissions',
-        'CO2 Removals': 'CO2 removals',
-        'Methane (CH4)': 'CH4',
-        'CH4': 'CH4', # no mapping, just added for completeness here
-        'Nitrous Oxides (N2O)': 'N2O',
-        'NO2': 'NO2', # no mapping, just added for completeness here
-        'Sulfur hexafluoride (SF6)': f'SF6 ({gwp_to_use})',
-        'SF6': f'SF6 ({gwp_to_use})',
-        "Hydrofluorocarbons (HFC'S)": f'HFCS ({gwp_to_use})',
-        "HFCs": f'HFCS ({gwp_to_use})',
-        "Perfluorocarbons (PFC'S)": f'PFCS ({gwp_to_use})',
-        "PFCs": f'PFCS ({gwp_to_use})',
-        'NOx': 'NOX',
-        'Nox': 'NOX',
-        'Co': 'CO',
-        'CO': 'CO', # no mapping, just added for completeness here
-        'NMVOCs': 'NMVOC',
-        'SOx': 'SOX', # no mapping, just added for completeness here
+        "Total GHG": f"KYOTOGHG ({gwp_to_use})",
+        "Carbon Dioxide (CO2)": "CO2",
+        "CO2": "CO2",  # no mapping, just added for completeness here
+        "CO2 emissions": "CO2 emissions",  # no mapping, just added for completeness here
+        "CO2 removals": "CO2 removals",  # no mapping, just added for completeness here
+        "CO2 Emissions": "CO2 emissions",
+        "CO2 Removals": "CO2 removals",
+        "Methane (CH4)": "CH4",
+        "CH4": "CH4",  # no mapping, just added for completeness here
+        "Nitrous Oxides (N2O)": "N2O",
+        "NO2": "NO2",  # no mapping, just added for completeness here
+        "Sulfur hexafluoride (SF6)": f"SF6 ({gwp_to_use})",
+        "SF6": f"SF6 ({gwp_to_use})",
+        "Hydrofluorocarbons (HFC'S)": f"HFCS ({gwp_to_use})",
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "Perfluorocarbons (PFC'S)": f"PFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "NOx": "NOX",
+        "Nox": "NOX",
+        "Co": "CO",
+        "CO": "CO",  # no mapping, just added for completeness here
+        "NMVOCs": "NMVOC",
+        "SOx": "SOX",  # no mapping, just added for completeness here
     },
 }
 
 filter_remove = {
-    'rem_cat': {'category': ['Memo items', 'G. Other (please specify)']},
+    "rem_cat": {"category": ["Memo items", "G. Other (please specify)"]},
     #'rem_ent': {'entity': ['GHG per capita', 'GHG per GDP (2015 prices)']},
 }
 
@@ -235,76 +253,88 @@ meta_data = {
 #### for processing
 # aggregate categories
 cats_to_agg = {
-    '1': {'sources': ['1.A'], 'name': 'Energy'}, # for trends
-    '1.A.4': {'sources': ['1.A.4.a', '1.A.4.b', '1.A.4.c', '1.A.4.ab'],
-              'name': 'Other sectors'},
-    '2.A.4': {'sources': ['2.A.4.b'], 'name': 'Soda Ash'},
-    '2.A.7': {'sources': ['2.A.7.a'], 'name': 'Other'},
-    '2.A': {'sources': ['2.A.1', '2.A.2', '2.A.4', '2.A.6', '2.A.7'], 'name': 'Mineral Products'},
-    '2.B.5': {'sources': ['2.B.5.f', '2.B.5.g'], 'name': 'Other'},
-    '2.B': {'sources': ['2.B.2', '2.B.5'], 'name': 'Chemical Industry'},
-    '6.D': {'sources': ['6.D', '6X.B'], 'name': 'Wastewater Treatment and Discharge'},
+    "1": {"sources": ["1.A"], "name": "Energy"},  # for trends
+    "1.A.4": {
+        "sources": ["1.A.4.a", "1.A.4.b", "1.A.4.c", "1.A.4.ab"],
+        "name": "Other sectors",
+    },
+    "2.A.4": {"sources": ["2.A.4.b"], "name": "Soda Ash"},
+    "2.A.7": {"sources": ["2.A.7.a"], "name": "Other"},
+    "2.A": {
+        "sources": ["2.A.1", "2.A.2", "2.A.4", "2.A.6", "2.A.7"],
+        "name": "Mineral Products",
+    },
+    "2.B.5": {"sources": ["2.B.5.f", "2.B.5.g"], "name": "Other"},
+    "2.B": {"sources": ["2.B.2", "2.B.5"], "name": "Chemical Industry"},
+    "6.D": {"sources": ["6.D", "6X.B"], "name": "Wastewater Treatment and Discharge"},
     #'6.E': {'sources': ['6.E', '6X.D'], 'Other'}, # currently empty
 }
 
 # downscale
 # 1.A.4.ab
 downscaling = {
-    'sectors': {
-        '24540': {
-            'basket': '24540',
-            'basket_contents': ['2'],
-            'entities': ['SF6', 'HFCS (SARGWP100)', 'PFCS (SARGWP100)'],
-            'dim': f"category ({coords_terminologies['category']})",
+    "sectors": {
+        "24540": {
+            "basket": "24540",
+            "basket_contents": ["2"],
+            "entities": ["SF6", "HFCS (SARGWP100)", "PFCS (SARGWP100)"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '1.A': {
-            'basket': '1.A',
-            'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4'],
-            'entities': ['CO2', 'CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
-            'tolerance': 0.05, # some inconsistencies (rounding?)
+        "1.A": {
+            "basket": "1.A",
+            "basket_contents": ["1.A.1", "1.A.2", "1.A.3", "1.A.4"],
+            "entities": ["CO2", "CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
+            "tolerance": 0.05,  # some inconsistencies (rounding?)
         },
-        '1.A.4.ab': {
-            'basket': '1.A.4.ab',
-            'basket_contents': ['1.A.4.a', '1.A.4.b'],
-            'entities': ['CO2', 'CH4', 'N2O', 'SOX', 'NOX', 'CO'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "1.A.4.ab": {
+            "basket": "1.A.4.ab",
+            "basket_contents": ["1.A.4.a", "1.A.4.b"],
+            "entities": ["CO2", "CH4", "N2O", "SOX", "NOX", "CO"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '1.A.4': {
-            'basket': '1.A.4',
-            'basket_contents': ['1.A.4.a', '1.A.4.b', '1.A.4.c'],
-            'entities': ['CO2', 'CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "1.A.4": {
+            "basket": "1.A.4",
+            "basket_contents": ["1.A.4.a", "1.A.4.b", "1.A.4.c"],
+            "entities": ["CO2", "CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '2': {
-            'basket': '2',
-            'basket_contents': ['2.A', '2.B', '2.F'],
-            'entities': ['CO2', 'CH4', 'N2O', 'SF6', 'PFCS (SARGWP100)', 'HFCS (SARGWP100)'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "2": {
+            "basket": "2",
+            "basket_contents": ["2.A", "2.B", "2.F"],
+            "entities": [
+                "CO2",
+                "CH4",
+                "N2O",
+                "SF6",
+                "PFCS (SARGWP100)",
+                "HFCS (SARGWP100)",
+            ],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '2.A': {
-            'basket': '2.A',
-            'basket_contents': ['2.A.1', '2.A.2', '2.A.4', '2.A.7'],
-            'entities': ['CO2', 'CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "2.A": {
+            "basket": "2.A",
+            "basket_contents": ["2.A.1", "2.A.2", "2.A.4", "2.A.7"],
+            "entities": ["CO2", "CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '2.B': {
-            'basket': '2.B',
-            'basket_contents': ['2.B.2', '2.B.5'],
-            'entities': ['CO2', 'CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "2.B": {
+            "basket": "2.B",
+            "basket_contents": ["2.B.2", "2.B.5"],
+            "entities": ["CO2", "CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '4': {
-            'basket': '4',
-            'basket_contents': ['4.A', '4.B', '4.C', '4.D', '4.E', '4.F', '4.G'],
-            'entities': ['CH4', 'N2O'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "4": {
+            "basket": "4",
+            "basket_contents": ["4.A", "4.B", "4.C", "4.D", "4.E", "4.F", "4.G"],
+            "entities": ["CH4", "N2O"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
-        '5': {
-            'basket': '5',
-            'basket_contents': ['5.A', '5.D'], # the other sectors are 0
-            'entities': ['CO2'],
-            'dim': f"category ({coords_terminologies['category']})",
+        "5": {
+            "basket": "5",
+            "basket_contents": ["5.A", "5.D"],  # the other sectors are 0
+            "entities": ["CO2"],
+            "dim": f"category ({coords_terminologies['category']})",
         },
     },
 }
@@ -312,125 +342,190 @@ downscaling = {
 # map to IPCC2006
 cat_conversion = {
     # ANNEXI to come (low priority as we read from CRF files)
-    'mapping': {
-        '1': '1',
-        '1.A': '1.A',
-        '1.A.1': '1.A.1',
-        '1.A.2': '1.A.2',
-        '1.A.3': '1.A.3',
-        '1.A.4': '1.A.4',
-        '1.A.4.a': '1.A.4.a',
-        '1.A.4.b': '1.A.4.b',
-        '1.A.4.c': '1.A.4.c',
-        '1.A.5': '1.A.5', # currently not needed
-        '1.B': '1.B', # currently not needed
-        '1.B.1': '1.B.1', # currently not needed
-        '1.B.2': '1.B.2', # currently not needed
-        '2': '2',
-        '2.A': '2.A',
-        '2.A.1': '2.A.1', # cement
-        '2.A.2': '2.A.2', # lime
-        '2.A.4': '2.A.4.b', # soda ash
-        '2.A.6': '2.A.5', # road paving with asphalt -> other
-        '2.A.7.a': '2.A.3', # glass
-        '2.B': 'M.2.B_2.B',
-        '2.B.2': '2.B.2', # nitric acid
-        '2.B.5.b': '2.B.8.b', # Ethylene
-        '2.B.5.f': 'M.2.B.10.a', # sulphuric acid
-        '2.B.5.g': 'M.2.B.10.b', # other chemicals
-        '2.C': '2.C',
-        '2.D': 'M.2.H.1_2',
-        '2.E': '2.B.9',
-        '2.F': '2.F',
-        '2.G': '2.H.3',
-        '4': 'M.AG',
-        '4.A': '3.A.1',
-        '4.B': '3.A.2',
-        '4.C': '3.C.7',
-        '4.D': 'M.3.C.45.AG',
-        '4.E': '3.C.1.c',
-        '4.F': '3.C.1.b',
-        '4.G': '3.C.8',
-        '5': 'M.LULUCF',
-        '6': '4',
-        '6.A': '4.A',
-        '6.B': '4.B',
-        '6.C': '4.C',
-        '6.D': '4.D',
-        '24540': '0',
-        '15163': 'M.0.EL',
-        '14637': 'M.BK',
-        '14424': 'M.BK.A',
-        '14423': 'M.BK.M',
-        '14638': 'M.BIO',
-        '7': '5',
-    }, #5.A-D ignored as not fitting 2006 cats
-
-    'aggregate': {
-        '2.A.4': {'sources': ['2.A.4.b'], 'name': 'Other uses of soda ashes'},
-        '2.B.8': {'sources': ['2.B.8.b'], 'name': 'Petrochemical and Carbon Black production'},
-        '2.B.10': {'sources': ['M.2.B.10.a', 'M.2.B.10.b'], 'name': 'Other'},
-        '2.B': {'sources': ['2.B.2', '2.B.8', '2.B.9', '2.B.10'], 'name': 'Chemical Industry'},
-        '2.H': {'sources': ['M.2.H.1_2', '2.H.3'], 'name': 'Other'},
+    "mapping": {
+        "1": "1",
+        "1.A": "1.A",
+        "1.A.1": "1.A.1",
+        "1.A.2": "1.A.2",
+        "1.A.3": "1.A.3",
+        "1.A.4": "1.A.4",
+        "1.A.4.a": "1.A.4.a",
+        "1.A.4.b": "1.A.4.b",
+        "1.A.4.c": "1.A.4.c",
+        "1.A.5": "1.A.5",  # currently not needed
+        "1.B": "1.B",  # currently not needed
+        "1.B.1": "1.B.1",  # currently not needed
+        "1.B.2": "1.B.2",  # currently not needed
+        "2": "2",
+        "2.A": "2.A",
+        "2.A.1": "2.A.1",  # cement
+        "2.A.2": "2.A.2",  # lime
+        "2.A.4": "2.A.4.b",  # soda ash
+        "2.A.6": "2.A.5",  # road paving with asphalt -> other
+        "2.A.7.a": "2.A.3",  # glass
+        "2.B": "M.2.B_2.B",
+        "2.B.2": "2.B.2",  # nitric acid
+        "2.B.5.b": "2.B.8.b",  # Ethylene
+        "2.B.5.f": "M.2.B.10.a",  # sulphuric acid
+        "2.B.5.g": "M.2.B.10.b",  # other chemicals
+        "2.C": "2.C",
+        "2.D": "M.2.H.1_2",
+        "2.E": "2.B.9",
+        "2.F": "2.F",
+        "2.G": "2.H.3",
+        "4": "M.AG",
+        "4.A": "3.A.1",
+        "4.B": "3.A.2",
+        "4.C": "3.C.7",
+        "4.D": "M.3.C.45.AG",
+        "4.E": "3.C.1.c",
+        "4.F": "3.C.1.b",
+        "4.G": "3.C.8",
+        "5": "M.LULUCF",
+        "6": "4",
+        "6.A": "4.A",
+        "6.B": "4.B",
+        "6.C": "4.C",
+        "6.D": "4.D",
+        "24540": "0",
+        "15163": "M.0.EL",
+        "14637": "M.BK",
+        "14424": "M.BK.A",
+        "14423": "M.BK.M",
+        "14638": "M.BIO",
+        "7": "5",
+    },  # 5.A-D ignored as not fitting 2006 cats
+    "aggregate": {
+        "2.A.4": {"sources": ["2.A.4.b"], "name": "Other uses of soda ashes"},
+        "2.B.8": {
+            "sources": ["2.B.8.b"],
+            "name": "Petrochemical and Carbon Black production",
+        },
+        "2.B.10": {"sources": ["M.2.B.10.a", "M.2.B.10.b"], "name": "Other"},
+        "2.B": {
+            "sources": ["2.B.2", "2.B.8", "2.B.9", "2.B.10"],
+            "name": "Chemical Industry",
+        },
+        "2.H": {"sources": ["M.2.H.1_2", "2.H.3"], "name": "Other"},
         # '2': {'sources': ['2.A', '2.B', '2.C', '2.F', '2.H'],
         #       'name': 'Industrial Processes and Product Use'},
-        '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-        '3.C.1': {'sources': ['3.C.1.b', '3.C.1.c'],
-                     'name': 'Emissions from biomass burning'},
-        'M.3.C.1.AG': {'sources': ['3.C.1.b', '3.C.1.c'],
-                     'name': 'Emissions from biomass burning (Agriculture)'},
-        '3.C': {'sources': ['3.C.1', 'M.3.C.45.AG', '3.C.7', '3.C.8'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-        'M.3.C.AG': {'sources': ['M.3.C.1.AG', 'M.3.C.45.AG', '3.C.7', '3.C.8'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land ('
-                             'Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
-        'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'], 'name': 'National total '
-                                                                    'excluding LULUCF'},
+        "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+        "3.C.1": {
+            "sources": ["3.C.1.b", "3.C.1.c"],
+            "name": "Emissions from biomass burning",
+        },
+        "M.3.C.1.AG": {
+            "sources": ["3.C.1.b", "3.C.1.c"],
+            "name": "Emissions from biomass burning (Agriculture)",
+        },
+        "3.C": {
+            "sources": ["3.C.1", "M.3.C.45.AG", "3.C.7", "3.C.8"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land",
+        },
+        "M.3.C.AG": {
+            "sources": ["M.3.C.1.AG", "M.3.C.45.AG", "3.C.7", "3.C.8"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land ("
+            "Agriculture)",
+        },
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG"],
+            "name": "Agriculture excluding livestock",
+        },
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4", "5"],
+            "name": "National total " "excluding LULUCF",
+        },
     },
-    'basket_copy': {
-        'GWPs_to_add': ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': 'SARGWP100',
+    "basket_copy": {
+        "GWPs_to_add": ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": "SARGWP100",
     },
 }
 
 sectors_to_save = [
-    '1', '1.A', '1.A.1', '1.A.2', '1.A.3', '1.A.4', '1.A.4.a', '1.A.4.b', '1.A.4.c',
-    '1.A.5',
-    '1.B', '1.B.1', '1.B.2',
-    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4', '2.A.5',
-    '2.B', '2.B.2', '2.B.8', '2.B.9', '2.B.10', '2.C', '2.F', '2.H',
-    '3', 'M.AG', '3.A', '3.A.1', '3.A.2',
-    '3.C', '3.C.1', 'M.3.C.1.AG', '3.C.7', 'M.3.C.45.AG', '3.C.8', 'M.3.C.AG',
-    'M.LULUCF', 'M.AG.ELV',
-    '4', '4.A', '4.B', '4.C', '4.D',
-    '0', 'M.0.EL', 'M.BK', 'M.BK.A', 'M.BK.M', 'M.BIO', '5']
+    "1",
+    "1.A",
+    "1.A.1",
+    "1.A.2",
+    "1.A.3",
+    "1.A.4",
+    "1.A.4.a",
+    "1.A.4.b",
+    "1.A.4.c",
+    "1.A.5",
+    "1.B",
+    "1.B.1",
+    "1.B.2",
+    "2",
+    "2.A",
+    "2.A.1",
+    "2.A.2",
+    "2.A.3",
+    "2.A.4",
+    "2.A.5",
+    "2.B",
+    "2.B.2",
+    "2.B.8",
+    "2.B.9",
+    "2.B.10",
+    "2.C",
+    "2.F",
+    "2.H",
+    "3",
+    "M.AG",
+    "3.A",
+    "3.A.1",
+    "3.A.2",
+    "3.C",
+    "3.C.1",
+    "M.3.C.1.AG",
+    "3.C.7",
+    "M.3.C.45.AG",
+    "3.C.8",
+    "M.3.C.AG",
+    "M.LULUCF",
+    "M.AG.ELV",
+    "4",
+    "4.A",
+    "4.B",
+    "4.C",
+    "4.D",
+    "0",
+    "M.0.EL",
+    "M.BK",
+    "M.BK.A",
+    "M.BK.M",
+    "M.BIO",
+    "5",
+]
 
 
 # gas baskets
 gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)': ['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)': ['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
 }
 
 basket_copy = {
-    'GWPs_to_add': ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
-    'entities': ["HFCS", "PFCS"],
-    'source_GWP': gwp_to_use,
+    "GWPs_to_add": ["AR4GWP100", "AR5GWP100", "AR6GWP100"],
+    "entities": ["HFCS", "PFCS"],
+    "source_GWP": gwp_to_use,
 }
 
+
 #### functions
 def is_int(input: str) -> bool:
+    """Check if a string evaluates to an integer under a defined locale"""
     try:
         locale.atoi(input)
-        return True
-    except:
+        return True  # noqa: TRY300
+    except Exception:
         return False

+ 121 - 77
src/unfccc_ghg_data/unfccc_reader/Israel/read_ISR_BUR2_from_pdf.py

@@ -1,4 +1,12 @@
-# read Israel's BUR2 from pdf
+"""
+Read Israel's BUR2 from pdf
+
+This script reads data from Israel's BUR2
+Data are read from pdf using camelot
+only the 2019 inventory is read as the BUR refers to BUR1 for earlier years
+
+"""
+
 
 # TODO: bunkers trend tables not read because of special format
 
@@ -9,7 +17,7 @@ import pandas as pd
 import primap2 as pm2
 
 # configuration import
-from .config_isr_bur2 import (
+from config_isr_bur2 import (
     basket_copy,
     cat_conversion,
     cats_to_agg,
@@ -29,23 +37,27 @@ from .config_isr_bur2 import (
     trend_table_def,
 )
 
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path, process_data_for_country
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)
 
 if __name__ == "__main__":
     ### genral configuration
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Israel' / 'BUR2'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Israel'
+    input_folder = downloaded_data_path / "UNFCCC" / "Israel" / "BUR2"
+    output_folder = extracted_data_path / "UNFCCC" / "Israel"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'ISR_BUR2_2021_'
-    inventory_file_pdf = '2nd_Biennial_Update_Report_2021_final.pdf'
-    #years_to_read = range(1990, 2018 + 1)
+    output_filename = "ISR_BUR2_2021_"
+    inventory_file_pdf = "2nd_Biennial_Update_Report_2021_final.pdf"
+    # years_to_read = range(1990, 2018 + 1)
     pages_to_read_trends = range(48, 54)
     pages_to_read_inventory = range(54, 66)
 
     # define locale to use for str to float conversion
-    locale_to_use = 'en_IL.UTF-8'
+    locale_to_use = "en_IL.UTF-8"
     locale.setlocale(locale.LC_NUMERIC, locale_to_use)
 
     compression = dict(zlib=True, complevel=9)
@@ -53,40 +65,44 @@ if __name__ == "__main__":
     #### trend tables
 
     # read
-    tables_trends = camelot.read_pdf(str(input_folder / inventory_file_pdf), pages=','.join(
-        [str(page) for page in pages_to_read_trends]), flavor='lattice')
+    tables_trends = camelot.read_pdf(
+        str(input_folder / inventory_file_pdf),
+        pages=",".join([str(page) for page in pages_to_read_trends]),
+        flavor="lattice",
+    )
 
     # convert to pm2
     table_trends = None
     for table in trend_table_def.keys():
         current_def = trend_table_def[table]
         new_table = None
-        for subtable in current_def['tables']:
+        for subtable in current_def["tables"]:
             if new_table is None:
                 new_table = tables_trends[subtable].df
             else:
                 new_table = pd.concat([new_table, tables_trends[subtable].df])
 
-        for col in new_table.columns.values:
+        for col in new_table.columns.to_numpy():
             new_table[col] = new_table[col].str.replace("\n", "")
 
-        new_table.iloc[0, 0] = current_def['given_col']
+        new_table.iloc[0, 0] = current_def["given_col"]
         new_table.columns = new_table.iloc[0]
         new_table = new_table.drop(labels=[0])
         new_table = new_table.reset_index(drop=True)
 
-        if 'take_only' in current_def.keys():
+        if "take_only" in current_def.keys():
             new_table = new_table[
-                new_table[current_def['given_col']].isin(current_def['take_only'])]
+                new_table[current_def["given_col"]].isin(current_def["take_only"])
+            ]
 
-        time_cols = [col for col in new_table.columns.values if is_int(col)]
+        time_cols = [col for col in new_table.columns.to_numpy() if is_int(col)]
         for col in time_cols:
             # no NE,NA etc, just numbers, so we can just remove the ','
-            new_table[col] = new_table[col].str.replace(',', '')
-            new_table[col] = new_table[col].str.replace(' ', '')
+            new_table[col] = new_table[col].str.replace(",", "")
+            new_table[col] = new_table[col].str.replace(" ", "")
 
-        for col in current_def['cols_add']:
-            new_table[col] = current_def['cols_add'][col]
+        for col in current_def["cols_add"]:
+            new_table[col] = current_def["cols_add"][col]
 
         if table_trends is None:
             table_trends = new_table
@@ -108,31 +124,32 @@ if __name__ == "__main__":
         # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
-        time_format='%Y'
+        time_format="%Y",
     )
 
-
     data_pm2_trends = pm2.pm2io.from_interchange_format(data_if_trends)
 
     #### inventory tables
     # read inventory tables
     tables_inv = camelot.read_pdf(
         str(input_folder / inventory_file_pdf),
-        pages=','.join([str(page) for page in pages_to_read_inventory]),
-        flavor='lattice')
+        pages=",".join([str(page) for page in pages_to_read_inventory]),
+        flavor="lattice",
+    )
 
     # process
     table_inv = None
     for table in inv_table_def.keys():
         new_table = None
         print(f"working on year {table}")
-        for subtable in inv_table_def[table]['tables']:
+        for subtable in inv_table_def[table]["tables"]:
             print(f"adding table {subtable}")
             if new_table is None:
                 new_table = tables_inv[subtable].df
             else:
-                new_table = pd.concat([new_table, tables_inv[subtable].df], axis=0,
-                                      join='outer')
+                new_table = pd.concat(
+                    [new_table, tables_inv[subtable].df], axis=0, join="outer"
+                )
                 new_table = new_table.reset_index(drop=True)
 
             # replace line breaks, double, and triple spaces in category names
@@ -146,75 +163,97 @@ if __name__ == "__main__":
         else:
             # replace line breaks in units and entities
             new_table.iloc[inv_tab_conf["entity_row"]] = new_table.iloc[
-                inv_tab_conf["entity_row"]].str.replace('\n', '')
+                inv_tab_conf["entity_row"]
+            ].str.replace("\n", "")
 
         # get_year
         year = new_table.iloc[inv_tab_conf["cat_pos"][0], inv_tab_conf["cat_pos"][1]]
 
         # set category col label
-        new_table.iloc[inv_tab_conf["cat_pos"][0], inv_tab_conf["cat_pos"][1]] = 'category'
+        new_table.iloc[
+            inv_tab_conf["cat_pos"][0], inv_tab_conf["cat_pos"][1]
+        ] = "category"
 
         new_table = pm2.pm2io.nir_add_unit_information(
             new_table,
-            unit_row=inv_tab_conf["unit_row"], entity_row=inv_tab_conf["entity_row"],
-            regexp_entity=inv_tab_conf["regex_entity"], regexp_unit=inv_tab_conf[
-                "regex_unit"],
-            default_unit="", manual_repl_unit=inv_tab_conf["unit_repl"])
+            unit_row=inv_tab_conf["unit_row"],
+            entity_row=inv_tab_conf["entity_row"],
+            regexp_entity=inv_tab_conf["regex_entity"],
+            regexp_unit=inv_tab_conf["regex_unit"],
+            default_unit="",
+            manual_repl_unit=inv_tab_conf["unit_repl"],
+        )
 
         # fix individual values
-        if table == '1996':
+        if table == "1996":
             loc = new_table[new_table["category"] == "NITRIC ACID PRODUCTION"].index
-            value = new_table.loc[loc, "CH4"].values
+            value = new_table.loc[loc, "CH4"].to_numpy()
             new_table.loc[loc, "N2O"] = value[0, 0]
-            new_table.loc[loc, "CH4"] = ''
-        if table == '2015':
+            new_table.loc[loc, "CH4"] = ""
+        if table == "2015":
             loc_total = new_table[
-                new_table["category"] == "Total national emissions and removals"].index
-            loc_IPPU = new_table[new_table["category"] == "2. Industrial processes"].index
-            value = new_table.loc[loc_IPPU, "PFCs"].values
+                new_table["category"] == "Total national emissions and removals"
+            ].index
+            loc_IPPU = new_table[
+                new_table["category"] == "2. Industrial processes"
+            ].index
+            value = new_table.loc[loc_IPPU, "PFCs"].to_numpy()
             new_table.loc[loc_total, "PFCs"] = value[0, 0]
 
         # remove lines with empty category
         new_table = new_table.drop(new_table[new_table["category"] == ""].index)
 
         # rename E. Other (please specify) according to row above
-        e_locs = list(new_table[new_table["category"] == "E. Other (please specify)"].index)
+        e_locs = list(
+            new_table[new_table["category"] == "E. Other (please specify)"].index
+        )
         for loc in e_locs:
             iloc = new_table.index.get_loc(loc)
-            if new_table.iloc[iloc - 1]["category"][
-                0] == "D. CO2 emissions and removals from soil":
+            if (
+                new_table.iloc[iloc - 1]["category"][0]
+                == "D. CO2 emissions and removals from soil"
+            ):
                 new_table.loc[loc]["category"] = "E. Other (LULUCF)"
-            elif new_table.iloc[iloc - 1]["category"][0] in ["D.Waste-water handling",
-                                                             'D. Waste-water handling']:
+            elif new_table.iloc[iloc - 1]["category"][0] in [
+                "D.Waste-water handling",
+                "D. Waste-water handling",
+            ]:
                 new_table.loc[loc]["category"] = "E. Other (Waste)"
 
         # rename G. Other (please specify) according to row above
-        g_locs = list(new_table[new_table["category"] == "G. Other (please specify)"].index)
+        g_locs = list(
+            new_table[new_table["category"] == "G. Other (please specify)"].index
+        )
         for loc in g_locs:
             iloc = new_table.index.get_loc(loc)
-            if new_table.iloc[iloc - 1]["category"][
-                0] == "F. Field burning of agricultural residues":
+            if (
+                new_table.iloc[iloc - 1]["category"][0]
+                == "F. Field burning of agricultural residues"
+            ):
                 new_table.loc[loc]["category"] = "G. Other (Agri)"
-            elif new_table.iloc[iloc - 1]["category"][
-                0] == "F. Consumption of halocarbons and sulphur hexafluoride":
+            elif (
+                new_table.iloc[iloc - 1]["category"][0]
+                == "F. Consumption of halocarbons and sulphur hexafluoride"
+            ):
                 new_table.loc[loc]["category"] = "G. Other (IPPU)"
 
         # set index and convert to long format
         new_table = new_table.set_index(inv_tab_conf["index_cols"])
-        new_table_long = pm2.pm2io.nir_convert_df_to_long(new_table, year,
-                                                          inv_tab_conf["header_long"])
+        new_table_long = pm2.pm2io.nir_convert_df_to_long(
+            new_table, year, inv_tab_conf["header_long"]
+        )
         # remove line breaks in values
         new_table_long["data"] = new_table_long["data"].str.replace("\n", "")
 
         if table_inv is None:
             table_inv = new_table_long
         else:
-            table_inv = pd.concat([table_inv, new_table_long], axis=0, join='outer')
+            table_inv = pd.concat([table_inv, new_table_long], axis=0, join="outer")
             table_inv = table_inv.reset_index(drop=True)
 
     # no NE,NA etc, just numbers, so we can just remove the ','
-    table_inv["data"] = table_inv["data"].str.replace(',', '')
-    table_inv["data"] = table_inv["data"].str.replace(' ', '')
+    table_inv["data"] = table_inv["data"].str.replace(",", "")
+    table_inv["data"] = table_inv["data"].str.replace(" ", "")
 
     # ###
     # convert to PRIMAP2 interchange format
@@ -231,14 +270,14 @@ if __name__ == "__main__":
         # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
-        time_format='%Y',
+        time_format="%Y",
     )
 
     data_pm2_inv = pm2.pm2io.from_interchange_format(data_if_inv)
 
     #### combine
     # tolerance needs to be high as rounding in trend tables leads to inconsistent data
-    data_pm2 = data_pm2_inv.pr.merge(data_pm2_trends,tolerance=0.11)
+    data_pm2 = data_pm2_inv.pr.merge(data_pm2_trends, tolerance=0.11)
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()
 
@@ -248,40 +287,44 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw"), data_if)
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
-
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
     #### processing
     data_proc_pm2 = data_pm2
 
     # combine CO2 emissions and removals
     temp_CO2 = data_proc_pm2["CO2"].copy()
-    #data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].to_array()
+    # data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].to_array()
     # .pr.sum(dim="variable", skipna=True, min_count=1)
-    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
+    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum(
+        dim="entity", skipna=True, min_count=1
+    )
     data_proc_pm2["CO2"].attrs = temp_CO2.attrs
     data_proc_pm2["CO2"] = data_proc_pm2["CO2"].fillna(temp_CO2)
 
     # actual processing
     country_processing_step1 = {
-        'aggregate_cats': cats_to_agg,
+        "aggregate_cats": cats_to_agg,
     }
     data_proc_pm2 = process_data_for_country(
         data_proc_pm2,
-        entities_to_ignore=['CO2 emissions', 'CO2 removals'],
+        entities_to_ignore=["CO2 emissions", "CO2 removals"],
         gas_baskets={},
         processing_info_country=country_processing_step1,
     )
 
     country_processing_step2 = {
-        'downscale': downscaling,
-        'basket_copy': basket_copy,
+        "downscale": downscaling,
+        "basket_copy": basket_copy,
     }
 
     data_proc_pm2 = process_data_for_country(
@@ -289,16 +332,16 @@ if __name__ == "__main__":
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
         processing_info_country=country_processing_step2,
-        cat_terminology_out = terminology_proc,
-        category_conversion = cat_conversion,
-        sectors_out = sectors_to_save,
+        cat_terminology_out=terminology_proc,
+        category_conversion=cat_conversion,
+        sectors_out=sectors_to_save,
     )
 
     # adapt source and metadata
     # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
     # ###
     # save data to IF and native format
@@ -307,9 +350,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Malaysia/__init__.py

@@ -0,0 +1,30 @@
+"""Read Malaysia's BURs, NIRs, NCs
+
+Scripts and configurations to read Malaysia's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'MYS'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    # print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=MYS
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 922 - 602
src/unfccc_ghg_data/unfccc_reader/Malaysia/config_mys_bur3.py

@@ -1,16 +1,22 @@
+"""Config for Malaysia's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 gwp_to_use = "AR4GWP100"
 
 
 cat_names_fix = {
-    '2A3 Glass Prod.': '2A3 Glass Production',
-    '2F6 Other Applications': '2F6 Other Applications (please specify)',
-    '3A2 Manure Mngmt': '3A2 Manure Mngmt.',
-    '3C7 Rice Cultivations': '3C7 Rice Cultivation',
+    "2A3 Glass Prod.": "2A3 Glass Production",
+    "2F6 Other Applications": "2F6 Other Applications (please specify)",
+    "3A2 Manure Mngmt": "3A2 Manure Mngmt.",
+    "3C7 Rice Cultivations": "3C7 Rice Cultivation",
 }
 
 values_replacement = {
-    '': '-',
-    ' ': '-',
+    "": "-",
+    " ": "-",
 }
 
 cols_for_space_stripping = ["Categories"]
@@ -18,25 +24,25 @@ cols_for_space_stripping = ["Categories"]
 index_cols = ["Categories", "entity", "unit"]
 
 # parameters part 2: conversion to interchange format
-cats_remove = ['Memo items', 'Information items']
+cats_remove = ["Memo items", "Information items"]
 
 cat_codes_manual = {
-    'Annual change in long-term storage of carbon in HWP waste': 'M.LTS.AC.HWP',
-    'Annual change in total long-term storage of carbon stored': 'M.LTS.AC.TOT',
-    'CO2 captured': 'M.CCS',
-    'CO2 from Biomass Burning for Energy Production': 'M.BIO',
-    'For domestic storage': 'M.CCS.DOM',
-    'For storage in other countries': 'M.CCS.OCT',
-    'International Aviation (International Bunkers)': 'M.BK.A',
-    'International Bunkers': 'M.BK',
-    'International Water-borne Transport (International Bunkers)': 'M.BK.M',
-    'Long-term storage of carbon in waste disposal sites': 'M.LTS.WASTE',
-    'Multilateral Operations': 'M.MULTIOP',
-    'Other (please specify)': 'M.OTHER',
-    'Total National Emissions and Removals': '0',
+    "Annual change in long-term storage of carbon in HWP waste": "M.LTS.AC.HWP",
+    "Annual change in total long-term storage of carbon stored": "M.LTS.AC.TOT",
+    "CO2 captured": "M.CCS",
+    "CO2 from Biomass Burning for Energy Production": "M.BIO",
+    "For domestic storage": "M.CCS.DOM",
+    "For storage in other countries": "M.CCS.OCT",
+    "International Aviation (International Bunkers)": "M.BK.A",
+    "International Bunkers": "M.BK",
+    "International Water-borne Transport (International Bunkers)": "M.BK.M",
+    "Long-term storage of carbon in waste disposal sites": "M.LTS.WASTE",
+    "Multilateral Operations": "M.MULTIOP",
+    "Other (please specify)": "M.OTHER",
+    "Total National Emissions and Removals": "0",
 }
 
-cat_code_regexp = r'(?P<code>^[A-Z0-9]{1,4})\s.*'
+cat_code_regexp = r"(?P<code>^[A-Z0-9]{1,4})\s.*"
 
 coords_terminologies = {
     "area": "ISO3",
@@ -48,17 +54,12 @@ coords_defaults = {
     "source": "MYS-GHG-inventory",
     "provenance": "measured",
     "area": "MYS",
-    "scenario": "BUR3"
+    "scenario": "BUR3",
 }
 
-coords_value_mapping = {
-}
+coords_value_mapping = {}
 
-coords_cols = {
-    "category": "Categories",
-    "entity": "entity",
-    "unit": "unit"
-}
+coords_cols = {"category": "Categories", "entity": "entity", "unit": "unit"}
 
 add_coords_cols = {
     "orig_cat_name": ["orig_cat_name", "category"],
@@ -76,600 +77,919 @@ meta_data = {
 terminology_proc = coords_terminologies["category"]
 
 table_def_templates = {
-    '184': { #184
-        "area": ['54,498,793,100'],
-        "cols": ['150,197,250,296,346,394,444,493,540,587,637,685,738'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel Combustion', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other emissions',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A1 Cement',
-               ],
-        },
-    },
-    '185': { #184
-        "area": ['34,504,813,99'],
-        "cols": ['128,177,224,273,321,373,425,473,519,564,611,661,713,765'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A Mineral',
-                '2A1 Cement', '2A2 Lime',
-               ],
-        },
-    },
-    '186': { #also 200
-        "area": ['53,498,786,104'],
-        "cols": ['150,197,238,296,347,396,444,489,540,587,634,686,739'],
-        "rows_to_fix": {
-            3: ['2A3 Glass', '2A4 Other Process', '2A5 Other (please',
-                '2B Chemical', '2B1 Ammonia', '2B2 Nitric Acid',
-                '2B3 Adipic Acid', '2B4 Caprolactam,', '2B5 Carbide',
-                '2B6 Titanium', '2B7 Soda Ash', '2B8 Petrochemical',
-                '2B10 Other (Please', '2C1 Iron and Steel', '2C2 Ferroalloys'
-               ],
-            2: ['2B9 Fluorochemical'],
-        },
-    },
-    '187': { # also 201
-        "area": ['39,499,807,91'],
-        "cols": ['132,185,232,280,327,375,425,470,522,568,613,664,713,763'],
-        "rows_to_fix": {
-            3: ['2A3 Glass', '2A4 Other Process', '2A5 Other (please',
-                '2B Chemical', '2B1 Ammonia', '2B2 Nitric Acid',
-                '2B3 Adipic Acid', '2B5 Carbide',
-                '2B6 Titanium', '2B7 Soda Ash', '2B8 Petrochemical',
-                '2B10 Other (Please', '2C1 Iron and Steel', '2C2 Ferroalloys',
-               ],
-            2: ['2B9 Fluorochemical'],
-            5: ['2B4 Caprolactam,'],
-        },
-    },
-    '188': {
-        "area": ['48,503,802,92'],
-        "cols": ['146,194,245,295,346,400,452,500,549,596,642,695,746'],
-        "rows_to_fix": {
-            3: ['2C3 Aluminium', '2C4 Magnesium', '2C7 Other (please',
-                '2D Non-Energy', '2D2 Paraffin Wax', '2D4 Other (please',
-                '2E Electronics', '2E1 Integrated', '2E5 Other (please',
-                '2F1 Refrigeration',
-               ],
-            2: ['2E2 TFT Flat Panel', '2E4 Heat Transfer'],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '189': {
-        "area": ['41,499,806,95'],
-        "cols": ['141,184,233,282,331,376,427,472,520,567,618,665,717,760'],
-        "rows_to_fix": {
-            3: ['2C3 Aluminium', '2C4 Magnesium', '2C7 Other (please',
-                '2D Non-Energy', '2D2 Paraffin Wax', '2D4 Other (please',
-                '2E Electronics', '2E1 Integrated', '2E5 Other (please',
-                '2F1 Refrigeration',
-               ],
-            2: ['2E2 TFT Flat Panel', '2E4 Heat Transfer'],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '190': {
-        "area": ['45,500,802,125'],
-        "cols": ['146,193,243,295,349,400,453,501,549,595,644,696,748'],
-        "rows_to_fix": {
-            3: ['2F2 Foam Blowing', '2F6 Other', '2G Other Product',
-                '2G2 SF6 and PFCs', '2G4 Other (Please', '2H1 Pulp and Paper',
-                '2H2 Food and', '2H3 Other (please', '3 AGRICULTURE,',
-               ],
-            2: ['2G1 Electrical', '2G3 N2O from', '3A1 Enteric'],
-        },
-    },
-    '191': {
-        "area": ['38,498,814,120'],
-        "cols": ['130,180,229,277,326,381,429,477,526,570,620,669,717,765'],
-        "rows_to_fix": {
-            3: ['2F2 Foam Blowing', '2F6 Other', '2G Other Product',
-                '2G2 SF6 and PFCs', '2G4 Other (Please', '2H1 Pulp and Paper',
-                '2H2 Food and', '2H3 Other (please', '3 AGRICULTURE,',
-               ],
-            2: ['2G1 Electrical', '2G3 N2O from', '3A1 Enteric'],
-        },
-    },
-    '192': {
-        "area": ['39,502,807,106'],
-        "cols": ['134,193,245,296,346,400,455,507,556,602,650,701,755'],
-        "rows_to_fix": {
-            3: ['3C1 Emissions from', '3C4 Direct N2O', '3C5 Indirect N2O',
-                '3C6 Indirect N2O', '3C8 Other (please', '3D1 Harvested Wood',
-                '3D2 Other (please',
-               ],
-            5: ['3C Aggregate',],
-        },
-    },
-    '193': {
-        "area": ['36,508,815,119'],
-        "cols": ['128,179,228,278,327,379,428,476,525,571,622,670,717,766'],
-        "rows_to_fix": {
-            3: ['3C1 Emissions from', '3C4 Direct N2O', '3C5 Indirect N2O',
-                '3C6 Indirect N2O', '3C8 Other (please', '3D1 Harvested',
-                '3D2 Other (please',
-               ],
-            5: ['3C Aggregate',],
-        },
-    },
-    '194': {
-        "area": ['80,502,762,151'],
-        "cols": ['201,243,285,329,376,419,462,502,551,591,635,679,724'],
-        "rows_to_fix": {
-            3: ['4C Incineration and', '4C2 Open Burning of', '4E Other',],
-            2: ['4A1 Managed Waste', '4A2 Unmanaged Waste', '4A3 Uncategorised Waste',
-                '4B Biological Treatment', '4D Wastewater', '4D1 Domestic Wastewater',
-                '4D2 Industrial Wastewater',
-               ],
-            5: ['5A Indirect N2O'],
-        },
-    },
-    '195': {
-        "area": ['78,508,765,103'],
-        "cols": ['191,230,271,314,352,400,438,475,519,566,600,645,686,730'],
-        "rows_to_fix": {
-            3: ['4C Incineration and', '4C2 Open Burning of', '4E Other',
-                '4B Biological', '4D Wastewater', '4D1 Domestic',
-                '4D2 Industrial', '5B Other (please'
-               ],
-            2: ['4A1 Managed Waste', '4A2 Unmanaged Waste', '4A3 Uncategorised',
-                '4A Solid Waste',
-               ],
-            5: ['5A Indirect N2O'],
-        },
-    },
-    '196': {
-        "area": ['80,502,762,151'],
-        "cols": ['201,243,285,329,376,419,462,502,551,591,635,679,724'],
-        "rows_to_fix": {
-            3: ['International Aviation', 'International Water-borne',
-                'CO2 from Biomass Burning', 'For storage in other',
-                'Long-term storage of', 'Annual change in total',
-                'Annual change in long-',
-               ],
-        },
-    },
-    '197': {
-        "area": ['74,507,779,201'],
-        "cols": ['182,226,268,311,354,398,444,482,524,565,610,654,693,733'],
-        "rows_to_fix": {
-            3: ['International Aviation', 'International Water-',
-                'CO2 from Biomass', 'For storage in other',
-                'Long-term storage of', 'Annual change in total',
-                'Annual change in long-',
-               ],
-        },
-    },
-    '198': { # first CH4 table
-        "area": ['54,498,793,100'],
-        "cols": ['140,197,250,296,346,394,444,493,540,587,637,685,738'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel Combustion', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other emissions',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A1 Cement',
-               ],
-            -3: ['2A Mineral Industry'],
-        },
-    },
-    '199': {
-        "area": ['34,506,818,97'],
-        "cols": ['132,177,228,276,329,377,432,479,528,574,618,667,722,774'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A1 Cement',
-                '2A Mineral', '2A2 Lime',
-               ],
-        },
-    },
-    '202': {
-        "area": ['48,503,802,92'],
-        "cols": ['146,194,245,295,346,400,452,500,549,596,642,695,746'],
-        "rows_to_fix": {
-            3: ['2C3 Aluminium', '2C7 Other (please',
-                '2D Non-Energy', '2D2 Paraffin Wax', '2D4 Other (please',
-                '2E Electronics', '2E1 Integrated', '2E5 Other (please',
-               ],
-            2: ['2C4 Magnesium', '2E2 TFT Flat Panel', '2E4 Heat Transfer',
-                '2F1 Refrigeration',
-               ],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '203': {
-        "area": ['41,499,806,95'],
-        "cols": ['141,184,233,282,331,376,427,472,520,567,618,665,717,760'],
-        "rows_to_fix": {
-            3: ['2C3 Aluminium', '2C7 Other (please',
-                '2D Non-Energy', '2D2 Paraffin Wax', '2D4 Other (please',
-                '2E Electronics', '2E1 Integrated', '2E5 Other (please',
-               ],
-            2: ['2C4 Magnesium', '2E2 TFT Flat Panel', '2E4 Heat Transfer',
-                '2F1 Refrigeration'
-               ],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '204': {
-        "area": ['45,500,802,125'],
-        "cols": ['146,193,243,295,349,400,455,501,549,595,644,696,748'],
-        "rows_to_fix": {
-            3: ['2F6 Other', '2G Other Product',
-                '2G2 SF6 and PFCs', '2G4 Other (Please', '2H1 Pulp and Paper',
-                '2H2 Food and', '2H3 Other (please', '3 AGRICULTURE,',
-                '3A1 Enteric',
-               ],
-            2: ['2F2 Foam Blowing', '2G1 Electrical', '2G3 N2O from'],
-        },
-    },
-    '205': {
-        "area": ['38,498,814,120'],
-        "cols": ['130,180,229,277,326,381,429,477,526,570,620,669,717,765'],
-        "rows_to_fix": {
-            3: ['2F6 Other', '2G Other Product',
-                '2G2 SF6 and PFCs', '2G4 Other (Please', '2H1 Pulp and Paper',
-                '2H2 Food and', '2H3 Other (please', '3 AGRICULTURE,',
-                '3A1 Enteric',
-               ],
-            2: ['2F2 Foam Blowing', '2G1 Electrical', '2G3 N2O from'],
-        },
-    },
-    '206': { #also 220
-        "area": ['39,502,807,106'],
-        "cols": ['134,193,245,296,346,400,455,507,556,602,650,701,755'],
-        "rows_to_fix": {
-            3: ['3C1 Emissions from', '3C4 Direct N2O', '3C5 Indirect N2O',
-                '3C6 Indirect N2O', '3C8 Other (please',
-                '3D2 Other (please',
-               ],
-            2: ['3D1 Harvested Wood',],
-            5: ['3C Aggregate',],
-        },
-    },
-    '207': { # also 221
-        "area": ['36,508,815,110'],
-        "cols": ['128,179,228,278,327,379,428,476,527,571,622,670,717,766'],
-        "rows_to_fix": {
-            3: ['3C1 Emissions from', '3C4 Direct N2O', '3C5 Indirect N2O',
-                '3C6 Indirect N2O', '3C8 Other (please',
-                '3D2 Other (please',
-               ],
-            2: ['3D1 Harvested',],
-            5: ['3C Aggregate',],
-        },
-    },
-    '208': { # also 222
-        "area": ['80,502,762,151'],
-        "cols": ['201,243,285,329,376,419,462,502,551,591,635,679,724'],
-        "rows_to_fix": {
-            3: ['4C Incineration and', '4C2 Open Burning of', '4E Other',
-                '4A1 Managed Waste', '4A2 Unmanaged Waste', '4A3 Uncategorised Waste',
-                '4B Biological Treatment', '4D Wastewater', '4D1 Domestic Wastewater',
-                '4D2 Industrial Wastewater'
-               ],
-            5: ['5A Indirect N2O'],
-        },
-    },
-    '209': { # also 223
-        "area": ['78,508,765,103'],
-        "cols": ['191,230,271,314,352,400,438,475,519,560,600,645,686,730'],
-        "rows_to_fix": {
-            3: ['4C Incineration and', '4C2 Open Burning of', '4E Other',
-                '4B Biological', '4D Wastewater', '4D1 Domestic',
-                '4D2 Industrial', '5B Other (please',
-                '4A1 Managed Waste', '4A2 Unmanaged Waste', '4A3 Uncategorised',
-                '4A Solid Waste'
-               ],
-            5: ['5A Indirect N2O'],
-        },
-    },
-    '210': { # also 224
-        "area": ['80,502,762,151'],
-        "cols": ['201,243,285,329,376,419,462,502,551,591,635,679,724'],
-        "rows_to_fix": {
-            3: ['International Aviation', 'International Water-borne',
-                'Long-term storage of', 'Annual change in total',
-                'Annual change in long-',
-               ],
-            2: ['CO2 from Biomass Burning', 'For storage in other',],
-        },
-    },
-    '211': { # also 225
-        "area": ['74,507,779,201'],
-        "cols": ['182,226,268,311,354,398,444,482,524,565,610,654,693,733'],
-        "rows_to_fix": {
-            3: ['International Aviation', 'International Water-',
-                'Long-term storage of', 'Annual change in total',
-                'Annual change in long-', 'CO2 from Biomass',
-               ],
-            2: ['For storage in other',],
-        },
-    },
-    '212': {
-        "area": ['54,498,793,100'],
-        "cols": ['150,197,250,296,346,394,444,493,540,587,637,685,738'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel Combustion', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other emissions',
-                '1C Carbon Dioxide', '2 INDUSTRIAL',
-               ],
-            2: ['2A1 Cement',],
-        },
-    },
-    '213': {
-        "area": ['34,504,813,99'],
-        "cols": ['128,177,224,273,321,373,425,473,519,564,611,661,713,765'],
-        "rows_to_fix": {
-            3: ['Total National', '1A Fuel', '1A1 Energy', '1A2 Manufacturing',
-                '1B Fugitive', '1B2 Oil and Natural', '1B3 Other',
-                '1C Carbon Dioxide', '2 INDUSTRIAL', '2A Mineral',
-               ],
-            2: ['2A1 Cement', '2A2 Lime',],
-        },
-    },
-    '214': {
-        "area": ['47,499,801,93'],
-        "cols": ['141,197,246,297,350,396,453,502,550,595,642,692,748'],
-        "rows_to_fix": {
-            3: ['2A5 Other (please',
-                '2B Chemical', '2B1 Ammonia', '2B2 Nitric Acid',
-                '2B3 Adipic Acid', '2B4 Caprolactam,', '2B5 Carbide',
-                '2B6 Titanium', '2B7 Soda Ash', '2B8 Petrochemical',
-                '2B10 Other (Please', '2C1 Iron and Steel', '2C2 Ferroalloys'
-               ],
-            2: ['2A3 Glass', '2A4 Other Process', '2B9 Fluorochemical'],
-            -3: ['2C Metal Industry'],
-        },
-    },
-    '215': {
-        "area": ['39,499,807,91'],
-        "cols": ['132,180,232,280,327,375,425,470,522,568,613,664,713,763'],
-        "rows_to_fix": {
-            3: ['2A5 Other (please',
-                '2B Chemical', '2B1 Ammonia', '2B2 Nitric Acid',
-                '2B3 Adipic Acid', '2B4 Caprolactam,', '2B5 Carbide',
-                '2B6 Titanium Dioxide', '2B7 Soda Ash', '2B8 Petrochemical',
-                '2B10 Other (Please', '2C1 Iron and Steel', '2C2 Ferroalloys'
-               ],
-            2: ['2A4 Other Process', '2B9 Fluorochemical'],
-            -3: ['2C Metal Industry'],
-        },
-    },
-    '216': {
-        "area": ['48,503,802,92'],
-        "cols": ['146,194,245,295,346,400,452,500,549,596,642,695,746'],
-        "rows_to_fix": {
-            3: ['2C7 Other (please', '2D Non-Energy', '2D2 Paraffin Wax',
-                '2D4 Other (please', '2E Electronics', '2E1 Integrated',
-                '2E5 Other (please',
-               ],
-            2: ['2C3 Aluminium', '2C4 Magnesium', '2E2 TFT Flat Panel',
-                '2E4 Heat Transfer', '2F1 Refrigeration',
-               ],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '217': {
-        "area": ['41,499,806,95'],
-        "cols": ['141,184,233,282,331,376,427,472,520,567,618,665,717,760'],
-        "rows_to_fix": {
-            3: ['2C7 Other (please', '2D Non-Energy', '2D2 Paraffin Wax',
-                '2D4 Other (please', '2E Electronics', '2E1 Integrated',
-                '2E5 Other (please',
-               ],
-            2: ['2C3 Aluminium', '2C4 Magnesium', '2E2 TFT Flat Panel',
-                '2E4 Heat Transfer', '2F1 Refrigeration',
-               ],
-            5: ['2F Product Uses as'],
-        },
-    },
-    '218': {
-        "area": ['45,500,802,125'],
-        "cols": ['146,193,243,295,349,400,455,501,549,595,644,696,748'],
-        "rows_to_fix": {
-            3: ['2F6 Other', '2G Other Product', '2G2 SF6 and PFCs',
-                '2G3 N2O from', '2H3 Other (please', '3 AGRICULTURE,',
-               ],
-            2: ['2F2 Foam Blowing', '2G1 Electrical', '2G4 Other (Please',
-                '2H1 Pulp and Paper', '2H2 Food and', '3A1 Enteric',],
-        },
-    },
-    '219': {
-        "area": ['38,498,814,120'],
-        "cols": ['130,180,229,277,326,381,429,477,526,570,620,669,717,765'],
-        "rows_to_fix": {
-            3: ['2F6 Other', '2G Other Product', '2G2 SF6 and PFCs',
-                '2G3 N2O from', '2H3 Other (please', '3 AGRICULTURE,',
-               ],
-            2: ['2F2 Foam Blowing', '2G1 Electrical', '2G4 Other (Please',
-                '2H1 Pulp and Paper', '2H2 Food and', '3A1 Enteric',],
-        },
-    },
-    '226': { # also 334, 238
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
-        "rows_to_fix": {
-            2: ['2B4 Caprolactam, Glyoxal and Glyoxylic Acid'],
-        }
-    },
-    '227': { # also 331, 335, 339
-        "area": ['27,510,818,99'],
-        "cols": ['250,290,333,372,413,452,494,536,576,616,656,699,739,781'],
-        "rows_to_fix": {
-            2: ['2B4 Caprolactam, Glyoxal and Glyoxylic Acid'],
-        }
-    },
-    '228': {
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
-        "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone'],
-            2: ['2D Non-Energy Products from Fuels and Solvent'],
-        },
-    },
-    '229': {
-        "area": ['25,512,819,86'],
-        "cols": ['246,291,331,370,412,454,495,536,577,619,656,699,740,777'],
-        "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone'],
-            2: ['2D Non-Energy Products from Fuels and Solvent'],
+    "184": {  # 184
+        "area": ["54,498,793,100"],
+        "cols": ["150,197,250,296,346,394,444,493,540,587,637,685,738"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel Combustion",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other emissions",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A1 Cement",
+            ],
+        },
+    },
+    "185": {  # 184
+        "area": ["34,504,813,99"],
+        "cols": ["128,177,224,273,321,373,425,473,519,564,611,661,713,765"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A Mineral",
+                "2A1 Cement",
+                "2A2 Lime",
+            ],
+        },
+    },
+    "186": {  # also 200
+        "area": ["53,498,786,104"],
+        "cols": ["150,197,238,296,347,396,444,489,540,587,634,686,739"],
+        "rows_to_fix": {
+            3: [
+                "2A3 Glass",
+                "2A4 Other Process",
+                "2A5 Other (please",
+                "2B Chemical",
+                "2B1 Ammonia",
+                "2B2 Nitric Acid",
+                "2B3 Adipic Acid",
+                "2B4 Caprolactam,",
+                "2B5 Carbide",
+                "2B6 Titanium",
+                "2B7 Soda Ash",
+                "2B8 Petrochemical",
+                "2B10 Other (Please",
+                "2C1 Iron and Steel",
+                "2C2 Ferroalloys",
+            ],
+            2: ["2B9 Fluorochemical"],
+        },
+    },
+    "187": {  # also 201
+        "area": ["39,499,807,91"],
+        "cols": ["132,185,232,280,327,375,425,470,522,568,613,664,713,763"],
+        "rows_to_fix": {
+            3: [
+                "2A3 Glass",
+                "2A4 Other Process",
+                "2A5 Other (please",
+                "2B Chemical",
+                "2B1 Ammonia",
+                "2B2 Nitric Acid",
+                "2B3 Adipic Acid",
+                "2B5 Carbide",
+                "2B6 Titanium",
+                "2B7 Soda Ash",
+                "2B8 Petrochemical",
+                "2B10 Other (Please",
+                "2C1 Iron and Steel",
+                "2C2 Ferroalloys",
+            ],
+            2: ["2B9 Fluorochemical"],
+            5: ["2B4 Caprolactam,"],
+        },
+    },
+    "188": {
+        "area": ["48,503,802,92"],
+        "cols": ["146,194,245,295,346,400,452,500,549,596,642,695,746"],
+        "rows_to_fix": {
+            3: [
+                "2C3 Aluminium",
+                "2C4 Magnesium",
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+                "2F1 Refrigeration",
+            ],
+            2: ["2E2 TFT Flat Panel", "2E4 Heat Transfer"],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "189": {
+        "area": ["41,499,806,95"],
+        "cols": ["141,184,233,282,331,376,427,472,520,567,618,665,717,760"],
+        "rows_to_fix": {
+            3: [
+                "2C3 Aluminium",
+                "2C4 Magnesium",
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+                "2F1 Refrigeration",
+            ],
+            2: ["2E2 TFT Flat Panel", "2E4 Heat Transfer"],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "190": {
+        "area": ["45,500,802,125"],
+        "cols": ["146,193,243,295,349,400,453,501,549,595,644,696,748"],
+        "rows_to_fix": {
+            3: [
+                "2F2 Foam Blowing",
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+            ],
+            2: ["2G1 Electrical", "2G3 N2O from", "3A1 Enteric"],
+        },
+    },
+    "191": {
+        "area": ["38,498,814,120"],
+        "cols": ["130,180,229,277,326,381,429,477,526,570,620,669,717,765"],
+        "rows_to_fix": {
+            3: [
+                "2F2 Foam Blowing",
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+            ],
+            2: ["2G1 Electrical", "2G3 N2O from", "3A1 Enteric"],
+        },
+    },
+    "192": {
+        "area": ["39,502,807,106"],
+        "cols": ["134,193,245,296,346,400,455,507,556,602,650,701,755"],
+        "rows_to_fix": {
+            3: [
+                "3C1 Emissions from",
+                "3C4 Direct N2O",
+                "3C5 Indirect N2O",
+                "3C6 Indirect N2O",
+                "3C8 Other (please",
+                "3D1 Harvested Wood",
+                "3D2 Other (please",
+            ],
+            5: [
+                "3C Aggregate",
+            ],
+        },
+    },
+    "193": {
+        "area": ["36,508,815,119"],
+        "cols": ["128,179,228,278,327,379,428,476,525,571,622,670,717,766"],
+        "rows_to_fix": {
+            3: [
+                "3C1 Emissions from",
+                "3C4 Direct N2O",
+                "3C5 Indirect N2O",
+                "3C6 Indirect N2O",
+                "3C8 Other (please",
+                "3D1 Harvested",
+                "3D2 Other (please",
+            ],
+            5: [
+                "3C Aggregate",
+            ],
+        },
+    },
+    "194": {
+        "area": ["80,502,762,151"],
+        "cols": ["201,243,285,329,376,419,462,502,551,591,635,679,724"],
+        "rows_to_fix": {
+            3: [
+                "4C Incineration and",
+                "4C2 Open Burning of",
+                "4E Other",
+            ],
+            2: [
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised Waste",
+                "4B Biological Treatment",
+                "4D Wastewater",
+                "4D1 Domestic Wastewater",
+                "4D2 Industrial Wastewater",
+            ],
+            5: ["5A Indirect N2O"],
+        },
+    },
+    "195": {
+        "area": ["78,508,765,103"],
+        "cols": ["191,230,271,314,352,400,438,475,519,566,600,645,686,730"],
+        "rows_to_fix": {
+            3: [
+                "4C Incineration and",
+                "4C2 Open Burning of",
+                "4E Other",
+                "4B Biological",
+                "4D Wastewater",
+                "4D1 Domestic",
+                "4D2 Industrial",
+                "5B Other (please",
+            ],
+            2: [
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised",
+                "4A Solid Waste",
+            ],
+            5: ["5A Indirect N2O"],
+        },
+    },
+    "196": {
+        "area": ["80,502,762,151"],
+        "cols": ["201,243,285,329,376,419,462,502,551,591,635,679,724"],
+        "rows_to_fix": {
+            3: [
+                "International Aviation",
+                "International Water-borne",
+                "CO2 from Biomass Burning",
+                "For storage in other",
+                "Long-term storage of",
+                "Annual change in total",
+                "Annual change in long-",
+            ],
+        },
+    },
+    "197": {
+        "area": ["74,507,779,201"],
+        "cols": ["182,226,268,311,354,398,444,482,524,565,610,654,693,733"],
+        "rows_to_fix": {
+            3: [
+                "International Aviation",
+                "International Water-",
+                "CO2 from Biomass",
+                "For storage in other",
+                "Long-term storage of",
+                "Annual change in total",
+                "Annual change in long-",
+            ],
+        },
+    },
+    "198": {  # first CH4 table
+        "area": ["54,498,793,100"],
+        "cols": ["140,197,250,296,346,394,444,493,540,587,637,685,738"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel Combustion",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other emissions",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A1 Cement",
+            ],
+            -3: ["2A Mineral Industry"],
+        },
+    },
+    "199": {
+        "area": ["34,506,818,97"],
+        "cols": ["132,177,228,276,329,377,432,479,528,574,618,667,722,774"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A1 Cement",
+                "2A Mineral",
+                "2A2 Lime",
+            ],
+        },
+    },
+    "202": {
+        "area": ["48,503,802,92"],
+        "cols": ["146,194,245,295,346,400,452,500,549,596,642,695,746"],
+        "rows_to_fix": {
+            3: [
+                "2C3 Aluminium",
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+            ],
+            2: [
+                "2C4 Magnesium",
+                "2E2 TFT Flat Panel",
+                "2E4 Heat Transfer",
+                "2F1 Refrigeration",
+            ],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "203": {
+        "area": ["41,499,806,95"],
+        "cols": ["141,184,233,282,331,376,427,472,520,567,618,665,717,760"],
+        "rows_to_fix": {
+            3: [
+                "2C3 Aluminium",
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+            ],
+            2: [
+                "2C4 Magnesium",
+                "2E2 TFT Flat Panel",
+                "2E4 Heat Transfer",
+                "2F1 Refrigeration",
+            ],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "204": {
+        "area": ["45,500,802,125"],
+        "cols": ["146,193,243,295,349,400,455,501,549,595,644,696,748"],
+        "rows_to_fix": {
+            3: [
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+                "3A1 Enteric",
+            ],
+            2: ["2F2 Foam Blowing", "2G1 Electrical", "2G3 N2O from"],
+        },
+    },
+    "205": {
+        "area": ["38,498,814,120"],
+        "cols": ["130,180,229,277,326,381,429,477,526,570,620,669,717,765"],
+        "rows_to_fix": {
+            3: [
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+                "3A1 Enteric",
+            ],
+            2: ["2F2 Foam Blowing", "2G1 Electrical", "2G3 N2O from"],
+        },
+    },
+    "206": {  # also 220
+        "area": ["39,502,807,106"],
+        "cols": ["134,193,245,296,346,400,455,507,556,602,650,701,755"],
+        "rows_to_fix": {
+            3: [
+                "3C1 Emissions from",
+                "3C4 Direct N2O",
+                "3C5 Indirect N2O",
+                "3C6 Indirect N2O",
+                "3C8 Other (please",
+                "3D2 Other (please",
+            ],
+            2: [
+                "3D1 Harvested Wood",
+            ],
+            5: [
+                "3C Aggregate",
+            ],
+        },
+    },
+    "207": {  # also 221
+        "area": ["36,508,815,110"],
+        "cols": ["128,179,228,278,327,379,428,476,527,571,622,670,717,766"],
+        "rows_to_fix": {
+            3: [
+                "3C1 Emissions from",
+                "3C4 Direct N2O",
+                "3C5 Indirect N2O",
+                "3C6 Indirect N2O",
+                "3C8 Other (please",
+                "3D2 Other (please",
+            ],
+            2: [
+                "3D1 Harvested",
+            ],
+            5: [
+                "3C Aggregate",
+            ],
+        },
+    },
+    "208": {  # also 222
+        "area": ["80,502,762,151"],
+        "cols": ["201,243,285,329,376,419,462,502,551,591,635,679,724"],
+        "rows_to_fix": {
+            3: [
+                "4C Incineration and",
+                "4C2 Open Burning of",
+                "4E Other",
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised Waste",
+                "4B Biological Treatment",
+                "4D Wastewater",
+                "4D1 Domestic Wastewater",
+                "4D2 Industrial Wastewater",
+            ],
+            5: ["5A Indirect N2O"],
+        },
+    },
+    "209": {  # also 223
+        "area": ["78,508,765,103"],
+        "cols": ["191,230,271,314,352,400,438,475,519,560,600,645,686,730"],
+        "rows_to_fix": {
+            3: [
+                "4C Incineration and",
+                "4C2 Open Burning of",
+                "4E Other",
+                "4B Biological",
+                "4D Wastewater",
+                "4D1 Domestic",
+                "4D2 Industrial",
+                "5B Other (please",
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised",
+                "4A Solid Waste",
+            ],
+            5: ["5A Indirect N2O"],
+        },
+    },
+    "210": {  # also 224
+        "area": ["80,502,762,151"],
+        "cols": ["201,243,285,329,376,419,462,502,551,591,635,679,724"],
+        "rows_to_fix": {
+            3: [
+                "International Aviation",
+                "International Water-borne",
+                "Long-term storage of",
+                "Annual change in total",
+                "Annual change in long-",
+            ],
+            2: [
+                "CO2 from Biomass Burning",
+                "For storage in other",
+            ],
+        },
+    },
+    "211": {  # also 225
+        "area": ["74,507,779,201"],
+        "cols": ["182,226,268,311,354,398,444,482,524,565,610,654,693,733"],
+        "rows_to_fix": {
+            3: [
+                "International Aviation",
+                "International Water-",
+                "Long-term storage of",
+                "Annual change in total",
+                "Annual change in long-",
+                "CO2 from Biomass",
+            ],
+            2: [
+                "For storage in other",
+            ],
+        },
+    },
+    "212": {
+        "area": ["54,498,793,100"],
+        "cols": ["150,197,250,296,346,394,444,493,540,587,637,685,738"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel Combustion",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other emissions",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+            ],
+            2: [
+                "2A1 Cement",
+            ],
+        },
+    },
+    "213": {
+        "area": ["34,504,813,99"],
+        "cols": ["128,177,224,273,321,373,425,473,519,564,611,661,713,765"],
+        "rows_to_fix": {
+            3: [
+                "Total National",
+                "1A Fuel",
+                "1A1 Energy",
+                "1A2 Manufacturing",
+                "1B Fugitive",
+                "1B2 Oil and Natural",
+                "1B3 Other",
+                "1C Carbon Dioxide",
+                "2 INDUSTRIAL",
+                "2A Mineral",
+            ],
+            2: [
+                "2A1 Cement",
+                "2A2 Lime",
+            ],
+        },
+    },
+    "214": {
+        "area": ["47,499,801,93"],
+        "cols": ["141,197,246,297,350,396,453,502,550,595,642,692,748"],
+        "rows_to_fix": {
+            3: [
+                "2A5 Other (please",
+                "2B Chemical",
+                "2B1 Ammonia",
+                "2B2 Nitric Acid",
+                "2B3 Adipic Acid",
+                "2B4 Caprolactam,",
+                "2B5 Carbide",
+                "2B6 Titanium",
+                "2B7 Soda Ash",
+                "2B8 Petrochemical",
+                "2B10 Other (Please",
+                "2C1 Iron and Steel",
+                "2C2 Ferroalloys",
+            ],
+            2: ["2A3 Glass", "2A4 Other Process", "2B9 Fluorochemical"],
+            -3: ["2C Metal Industry"],
+        },
+    },
+    "215": {
+        "area": ["39,499,807,91"],
+        "cols": ["132,180,232,280,327,375,425,470,522,568,613,664,713,763"],
+        "rows_to_fix": {
+            3: [
+                "2A5 Other (please",
+                "2B Chemical",
+                "2B1 Ammonia",
+                "2B2 Nitric Acid",
+                "2B3 Adipic Acid",
+                "2B4 Caprolactam,",
+                "2B5 Carbide",
+                "2B6 Titanium Dioxide",
+                "2B7 Soda Ash",
+                "2B8 Petrochemical",
+                "2B10 Other (Please",
+                "2C1 Iron and Steel",
+                "2C2 Ferroalloys",
+            ],
+            2: ["2A4 Other Process", "2B9 Fluorochemical"],
+            -3: ["2C Metal Industry"],
+        },
+    },
+    "216": {
+        "area": ["48,503,802,92"],
+        "cols": ["146,194,245,295,346,400,452,500,549,596,642,695,746"],
+        "rows_to_fix": {
+            3: [
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+            ],
+            2: [
+                "2C3 Aluminium",
+                "2C4 Magnesium",
+                "2E2 TFT Flat Panel",
+                "2E4 Heat Transfer",
+                "2F1 Refrigeration",
+            ],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "217": {
+        "area": ["41,499,806,95"],
+        "cols": ["141,184,233,282,331,376,427,472,520,567,618,665,717,760"],
+        "rows_to_fix": {
+            3: [
+                "2C7 Other (please",
+                "2D Non-Energy",
+                "2D2 Paraffin Wax",
+                "2D4 Other (please",
+                "2E Electronics",
+                "2E1 Integrated",
+                "2E5 Other (please",
+            ],
+            2: [
+                "2C3 Aluminium",
+                "2C4 Magnesium",
+                "2E2 TFT Flat Panel",
+                "2E4 Heat Transfer",
+                "2F1 Refrigeration",
+            ],
+            5: ["2F Product Uses as"],
+        },
+    },
+    "218": {
+        "area": ["45,500,802,125"],
+        "cols": ["146,193,243,295,349,400,455,501,549,595,644,696,748"],
+        "rows_to_fix": {
+            3: [
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G3 N2O from",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+            ],
+            2: [
+                "2F2 Foam Blowing",
+                "2G1 Electrical",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "3A1 Enteric",
+            ],
+        },
+    },
+    "219": {
+        "area": ["38,498,814,120"],
+        "cols": ["130,180,229,277,326,381,429,477,526,570,620,669,717,765"],
+        "rows_to_fix": {
+            3: [
+                "2F6 Other",
+                "2G Other Product",
+                "2G2 SF6 and PFCs",
+                "2G3 N2O from",
+                "2H3 Other (please",
+                "3 AGRICULTURE,",
+            ],
+            2: [
+                "2F2 Foam Blowing",
+                "2G1 Electrical",
+                "2G4 Other (Please",
+                "2H1 Pulp and Paper",
+                "2H2 Food and",
+                "3A1 Enteric",
+            ],
+        },
+    },
+    "226": {  # also 234, 238
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
+        "rows_to_fix": {
+            2: ["2B4 Caprolactam, Glyoxal and Glyoxylic Acid"],
+        },
+    },
+    "227": {  # also 231, 235, 239
+        "area": ["27,510,818,99"],
+        "cols": ["250,290,333,372,413,452,494,536,576,616,656,699,739,781"],
+        "rows_to_fix": {
+            2: ["2B4 Caprolactam, Glyoxal and Glyoxylic Acid"],
+        },
+    },
+    "228": {
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
+        "rows_to_fix": {
+            3: ["2F Product Uses as Substitutes for Ozone"],
+            2: ["2D Non-Energy Products from Fuels and Solvent"],
+        },
+    },
+    "229": {
+        "area": ["25,512,819,86"],
+        "cols": ["246,291,331,370,412,454,495,536,577,619,656,699,740,777"],
+        "rows_to_fix": {
+            3: ["2F Product Uses as Substitutes for Ozone"],
+            2: ["2D Non-Energy Products from Fuels and Solvent"],
+        },
+    },
+    "230": {
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
+        "rows_to_fix": {
+            -3: [
+                "Total National Emissions and Removals",
+                "2 INDUSTRIAL PROCESSES AND PRODUCT USE",
+            ],
+            2: ["2B4 Caprolactam, Glyoxal and Glyoxylic Acid"],
+        },
+    },
+    "232": {  # also 236
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
+        "rows_to_fix": {
+            -3: [
+                "2G2 SF6 and PFCs from Other Product Uses",
+            ],
+            2: [
+                "2D Non-Energy Products from Fuels and Solvent",
+                "2F Product Uses as Substitutes for Ozone",
+            ],
         },
     },
-    '230': {
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
-        "rows_to_fix": {
-            -3: ['Total National Emissions and Removals', '2 INDUSTRIAL PROCESSES AND PRODUCT USE'],
-            2: ['2B4 Caprolactam, Glyoxal and Glyoxylic Acid'],
-        }
-    },
-    '232': { # also 236
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
-        "rows_to_fix": {
-            -3: ['2G2 SF6 and PFCs from Other Product Uses',],
-            2: ['2D Non-Energy Products from Fuels and Solvent',
-                '2F Product Uses as Substitutes for Ozone',]
-        },
-    },
-    '233': {
-        "area": ['25,512,819,86'],
-        "cols": ['246,291,331,370,412,454,495,536,577,619,656,699,740,777'],
-        "rows_to_fix": {
-            -5: ['2F Product Uses as Substitutes for Ozone'],
-            2: ['2D Non-Energy Products from Fuels and Solvent'],
-            -3: ['2G Other Product Manufacture and Use',
-                 '2G2 SF6 and PFCs from Other Product Uses',]
+    "233": {
+        "area": ["25,512,819,86"],
+        "cols": ["246,291,331,370,412,454,495,536,577,619,656,699,740,777"],
+        "rows_to_fix": {
+            -5: ["2F Product Uses as Substitutes for Ozone"],
+            2: ["2D Non-Energy Products from Fuels and Solvent"],
+            -3: [
+                "2G Other Product Manufacture and Use",
+                "2G2 SF6 and PFCs from Other Product Uses",
+            ],
         },
     },
-    '237': {
-        "area": ['25,512,819,86'],
-        "cols": ['246,291,331,370,412,454,495,536,577,619,656,699,740,777'],
+    "237": {
+        "area": ["25,512,819,86"],
+        "cols": ["246,291,331,370,412,454,495,536,577,619,656,699,740,777"],
         "rows_to_fix": {
-            2: ['2D Non-Energy Products from Fuels and Solvent',
-                '2F Product Uses as Substitutes for Ozone'],
+            2: [
+                "2D Non-Energy Products from Fuels and Solvent",
+                "2F Product Uses as Substitutes for Ozone",
+            ],
         },
     },
-    '240': {
-        "area": ['48,510,797,99'],
-        "cols": ['271,310,350,393,435,475,514,557,594,640,678,719,760'],
+    "240": {
+        "area": ["48,510,797,99"],
+        "cols": ["271,310,350,393,435,475,514,557,594,640,678,719,760"],
         "rows_to_fix": {
-            2: ['2D Non-Energy Products from Fuels and Solvent',
-                '2F Product Uses as Substitutes for Ozone'],
-            -3: ['2E Electronics Industry',
-                 '2F1 Refrigeration and Air Conditioning',
-                 '2G2 SF6 and PFCs from Other Product Uses',],
+            2: [
+                "2D Non-Energy Products from Fuels and Solvent",
+                "2F Product Uses as Substitutes for Ozone",
+            ],
+            -3: [
+                "2E Electronics Industry",
+                "2F1 Refrigeration and Air Conditioning",
+                "2G2 SF6 and PFCs from Other Product Uses",
+            ],
         },
     },
-    '241': {
-        "area": ['25,512,819,86'],
-        "cols": ['246,291,331,370,412,454,495,536,577,619,656,699,740,777'],
+    "241": {
+        "area": ["25,512,819,86"],
+        "cols": ["246,291,331,370,412,454,495,536,577,619,656,699,740,777"],
         "rows_to_fix": {
-            2: ['2D Non-Energy Products from Fuels and Solvent',
-                '2F Product Uses as Substitutes for Ozone',
-                '2E1 Integrated Circuit or Semiconductor',],
-            -3: ['2F1 Refrigeration and Air Conditioning',
-                 '2G2 SF6 and PFCs from Other Product Uses',],
+            2: [
+                "2D Non-Energy Products from Fuels and Solvent",
+                "2F Product Uses as Substitutes for Ozone",
+                "2E1 Integrated Circuit or Semiconductor",
+            ],
+            -3: [
+                "2F1 Refrigeration and Air Conditioning",
+                "2G2 SF6 and PFCs from Other Product Uses",
+            ],
         },
     },
 }
 
 table_defs = {
-    '184': {"template": '184', "entity": "CO2", "unit": "Gg CO2 / yr"}, #CO2
-    '185': {"template": '185', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '186': {"template": '186', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '187': {"template": '187', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '188': {"template": '188', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '189': {"template": '189', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '190': {"template": '190', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '191': {"template": '191', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '192': {"template": '192', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '193': {"template": '193', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '194': {"template": '194', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '195': {"template": '195', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '196': {"template": '196', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '197': {"template": '197', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '198': {"template": '198', "entity": "CH4", "unit": "Gg CH4 / yr"}, #CH4
-    '199': {"template": '199', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '200': {"template": '186', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '201': {"template": '187', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '202': {"template": '202', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '203': {"template": '203', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '204': {"template": '204', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '205': {"template": '205', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '206': {"template": '206', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '207': {"template": '207', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '208': {"template": '208', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '209': {"template": '209', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '210': {"template": '210', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '211': {"template": '211', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '212': {"template": '212', "entity": "N2O", "unit": "Gg N2O / yr"}, #N2O
-    '213': {"template": '213', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '214': {"template": '214', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '215': {"template": '215', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '216': {"template": '216', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '217': {"template": '217', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '218': {"template": '218', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '219': {"template": '219', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '220': {"template": '206', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '221': {"template": '207', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '222': {"template": '208', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '223': {"template": '209', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '224': {"template": '210', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '225': {"template": '211', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '226': {"template": '226', "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"}, #HFCs
-    '227': {"template": '227', "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '228': {"template": '228', "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '229': {"template": '229', "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '230': {"template": '230', "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"}, #PFCs
-    '231': {"template": '227', "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '232': {"template": '232', "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '233': {"template": '233', "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '234': {"template": '226', "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"}, #SF6
-    '235': {"template": '227', "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '236': {"template": '232', "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '237': {"template": '237', "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '238': {"template": '226', "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"}, #NF3
-    '239': {"template": '227', "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '240': {"template": '240', "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
-    '241': {"template": '241', "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "184": {"template": "184", "entity": "CO2", "unit": "Gg CO2 / yr"},  # CO2
+    "185": {"template": "185", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "186": {"template": "186", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "187": {"template": "187", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "188": {"template": "188", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "189": {"template": "189", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "190": {"template": "190", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "191": {"template": "191", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "192": {"template": "192", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "193": {"template": "193", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "194": {"template": "194", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "195": {"template": "195", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "196": {"template": "196", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "197": {"template": "197", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "198": {"template": "198", "entity": "CH4", "unit": "Gg CH4 / yr"},  # CH4
+    "199": {"template": "199", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "200": {"template": "186", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "201": {"template": "187", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "202": {"template": "202", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "203": {"template": "203", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "204": {"template": "204", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "205": {"template": "205", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "206": {"template": "206", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "207": {"template": "207", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "208": {"template": "208", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "209": {"template": "209", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "210": {"template": "210", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "211": {"template": "211", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "212": {"template": "212", "entity": "N2O", "unit": "Gg N2O / yr"},  # N2O
+    "213": {"template": "213", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "214": {"template": "214", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "215": {"template": "215", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "216": {"template": "216", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "217": {"template": "217", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "218": {"template": "218", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "219": {"template": "219", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "220": {"template": "206", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "221": {"template": "207", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "222": {"template": "208", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "223": {"template": "209", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "224": {"template": "210", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "225": {"template": "211", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "226": {
+        "template": "226",
+        "entity": "HFCS (AR4GWP100)",
+        "unit": "Gg CO2 / yr",
+    },  # HFCs
+    "227": {"template": "227", "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "228": {"template": "228", "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "229": {"template": "229", "entity": "HFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "230": {
+        "template": "230",
+        "entity": "PFCS (AR4GWP100)",
+        "unit": "Gg CO2 / yr",
+    },  # PFCs
+    "231": {"template": "227", "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "232": {"template": "232", "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "233": {"template": "233", "entity": "PFCS (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "234": {
+        "template": "226",
+        "entity": "SF6 (AR4GWP100)",
+        "unit": "Gg CO2 / yr",
+    },  # SF6
+    "235": {"template": "227", "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "236": {"template": "232", "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "237": {"template": "237", "entity": "SF6 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "238": {
+        "template": "226",
+        "entity": "NF3 (AR4GWP100)",
+        "unit": "Gg CO2 / yr",
+    },  # NF3
+    "239": {"template": "227", "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "240": {"template": "240", "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
+    "241": {"template": "241", "entity": "NF3 (AR4GWP100)", "unit": "Gg CO2 / yr"},
 }
 
 country_processing_step1 = {
-    'aggregate_cats': {
-        'M.3.C.AG': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5',
-                                 '3.C.6', '3.C.7', '3.C.8'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land '
-                             '(Agriculture)'},
-        'M.3.D.AG': {'sources': ['3.D.2'],
-                     'name': 'Other (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG', 'M.3.D.AG'],
-                     'name': 'Agriculture excluding livestock'},
-        'M.AG': {'sources': ['3.A', 'M.AG.ELV'],
-                     'name': 'Agriculture'},
-        'M.3.D.LU': {'sources': ['3.D.1'],
-                     'name': 'Other (LULUCF)'},
-        'M.LULUCF': {'sources': ['3.B', 'M.3.D.LU'],
-                     'name': 'LULUCF'},
-        'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'],
-                     'name': 'National total emissions excluding LULUCF'},
-    },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "aggregate_cats": {
+        "M.3.C.AG": {
+            "sources": [
+                "3.C.1",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+                "3.C.8",
+            ],
+            "name": "Aggregate sources and non-CO2 emissions sources on land "
+            "(Agriculture)",
+        },
+        "M.3.D.AG": {"sources": ["3.D.2"], "name": "Other (Agriculture)"},
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG", "M.3.D.AG"],
+            "name": "Agriculture excluding livestock",
+        },
+        "M.AG": {"sources": ["3.A", "M.AG.ELV"], "name": "Agriculture"},
+        "M.3.D.LU": {"sources": ["3.D.1"], "name": "Other (LULUCF)"},
+        "M.LULUCF": {"sources": ["3.B", "M.3.D.LU"], "name": "LULUCF"},
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4", "5"],
+            "name": "National total emissions excluding LULUCF",
+        },
+    },
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
     },
 }
 
 gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
 }

+ 258 - 253
src/unfccc_ghg_data/unfccc_reader/Malaysia/config_mys_bur4.py

@@ -1,3 +1,9 @@
+"""Config for Malaysia's BUR4
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 gwp_to_use = "AR4GWP100"
 
 
@@ -9,8 +15,8 @@ cat_names_fix = {
 }
 
 values_replacement = {
-    '': '-',
-    ' ': '-',
+    "": "-",
+    " ": "-",
 }
 
 cols_for_space_stripping = ["Categories"]
@@ -18,25 +24,25 @@ cols_for_space_stripping = ["Categories"]
 index_cols = ["Categories", "entity", "unit"]
 
 # parameters part 2: conversion to interchange format
-cats_remove = ['Memo items', 'Information items',  'Information items (1)']
+cats_remove = ["Memo items", "Information items", "Information items (1)"]
 
 cat_codes_manual = {
-    'Annual change in long-term storage of carbon in HWP waste': 'M.LTS.AC.HWP',
-    'Annual change in total long-term storage of carbon stored': 'M.LTS.AC.TOT',
-    'CO2 captured': 'M.CCS',
-    'CO2 from Biomass Burning for Energy Production': 'M.BIO',
-    'For domestic storage': 'M.CCS.DOM',
-    'For storage in other countries': 'M.CCS.OCT',
-    'International Aviation (International Bunkers)': 'M.BK.A',
-    'International Bunkers': 'M.BK',
-    'International Water-borne Transport (International Bunkers)': 'M.BK.M',
-    'Long-term storage of carbon in waste disposal sites': 'M.LTS.WASTE',
-    'Multilateral Operations': 'M.MULTIOP',
-    'Other (please specify)': 'M.OTHER',
-    'Total National Emissions and Removals': '0',
+    "Annual change in long-term storage of carbon in HWP waste": "M.LTS.AC.HWP",
+    "Annual change in total long-term storage of carbon stored": "M.LTS.AC.TOT",
+    "CO2 captured": "M.CCS",
+    "CO2 from Biomass Burning for Energy Production": "M.BIO",
+    "For domestic storage": "M.CCS.DOM",
+    "For storage in other countries": "M.CCS.OCT",
+    "International Aviation (International Bunkers)": "M.BK.A",
+    "International Bunkers": "M.BK",
+    "International Water-borne Transport (International Bunkers)": "M.BK.M",
+    "Long-term storage of carbon in waste disposal sites": "M.LTS.WASTE",
+    "Multilateral Operations": "M.MULTIOP",
+    "Other (please specify)": "M.OTHER",
+    "Total National Emissions and Removals": "0",
 }
 
-cat_code_regexp = r'(?P<code>^[A-Z0-9]{1,4})\s.*'
+cat_code_regexp = r"(?P<code>^[A-Z0-9]{1,4})\s.*"
 
 
 coords_terminologies = {
@@ -49,27 +55,22 @@ coords_defaults = {
     "source": "MYS-GHG-inventory",
     "provenance": "measured",
     "area": "MYS",
-    "scenario": "BUR4"
+    "scenario": "BUR4",
 }
 
-coords_value_mapping = {
-}
+coords_value_mapping = {}
 
-coords_cols = {
-    "category": "Categories",
-    "entity": "entity",
-    "unit": "unit"
-}
+coords_cols = {"category": "Categories", "entity": "entity", "unit": "unit"}
 
 add_coords_cols = {
     "orig_cat_name": ["orig_cat_name", "category"],
 }
 
-#filter_remove = {
+# filter_remove = {
 #    "f1": {
 #        "entity": ["CO2(grossemissions)", "CO2(removals)"],
 #    },
-#}
+# }
 
 meta_data = {
     "references": "https://unfccc.int/documents/624776",
@@ -84,318 +85,322 @@ terminology_proc = coords_terminologies["category"]
 
 table_def_templates = {
     # CO2
-    '203': {  # 203, 249
-        "area": ['70,480,768,169'],
+    "203": {  # 203, 249
+        "area": ["70,480,768,169"],
     },
-    '204': {  # 204
-        "area": ['70,500,763,141'],
+    "204": {  # 204
+        "area": ["70,500,763,141"],
     },
-    '205': {  # 205, 209, 2014, 2018
-        "area": ['70,495,763,95'],
+    "205": {  # 205, 209, 2014, 2018
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
-            2: ['5A Indirect N2O emissions from the atmospheric deposition of'],
+            2: ["5A Indirect N2O emissions from the atmospheric deposition of"],
         },
     },
-    '206': {  # 206
-        "area": ['70,495,763,353'],
+    "206": {  # 206
+        "area": ["70,495,763,353"],
     },
-    '207': {  # 207, 208, 211, 212, 213, 215, 217, 223, 227, 231,
+    "207": {  # 207, 208, 211, 212, 213, 215, 217, 223, 227, 231,
         # 251, 257, 259, 263, 265
-        "area": ['70,495,763,95'],
+        "area": ["70,495,763,95"],
     },
-    '216': {  #  216
-        "area": ['70,500,763,95'],
+    "216": {  #  216
+        "area": ["70,500,763,95"],
     },
     # CH4
-    '219': {  # 219, 255
-        "area": ['70,480,768,100'],
+    "219": {  # 219, 255
+        "area": ["70,480,768,100"],
     },
-    '220': {  # 220, 224, 228
-        "area": ['70,495,763,95'],
+    "220": {  # 220, 224, 228
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
-    '221': {  # 221
-        "area": ['92,508,748,92'],
-        "cols": ['298,340,380,422,462,502,542,582,622,662,702'],
+    "221": {  # 221
+        "area": ["92,508,748,92"],
+        "cols": ["298,340,380,422,462,502,542,582,622,662,702"],
         "rows_to_fix": {
-            3: ['3C Aggregate sources and Non-CO2 emissions'],
-            2: ['5A Indirect N2O emissions from the atmospheric'],
+            3: ["3C Aggregate sources and Non-CO2 emissions"],
+            2: ["5A Indirect N2O emissions from the atmospheric"],
         },
     },
-    '222': {  # 222
-        "area": ['70,495,763,323'],
+    "222": {  # 222
+        "area": ["70,495,763,323"],
         "rows_to_fix": {
-            2: ['Annual change in long-term storage of carbon in HWP'],
+            2: ["Annual change in long-term storage of carbon in HWP"],
         },
     },
-    '225': {  # 225
-        "area": ['92,508,748,92'],
-        "cols": ['311,357,400,443,486,529,572,615,658,701'],
+    "225": {  # 225
+        "area": ["92,508,748,92"],
+        "cols": ["311,357,400,443,486,529,572,615,658,701"],
         "rows_to_fix": {
-            3: ['3C Aggregate sources and Non-CO2 emissions'],
+            3: ["3C Aggregate sources and Non-CO2 emissions"],
         },
     },
-    '226': {  # 226, 230
-        "area": ['70,495,763,95'],
+    "226": {  # 226, 230
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
-            2: ['5A Indirect N2O emissions from the atmospheric',
-                'Annual change in long-term storage of carbon in HWP'],
+            2: [
+                "5A Indirect N2O emissions from the atmospheric",
+                "Annual change in long-term storage of carbon in HWP",
+            ],
         },
     },
-    '229': {  # 229
-        "area": ['114,508,725,92'],
-        "cols": ['333,379,421,464,506,548,590,632,674'],
+    "229": {  # 229
+        "area": ["114,508,725,92"],
+        "cols": ["333,379,421,464,506,548,590,632,674"],
         "rows_to_fix": {
-            3: ['3C Aggregate sources and Non-CO2 emissions'],
+            3: ["3C Aggregate sources and Non-CO2 emissions"],
         },
     },
     # N2O
-    '232': {  # 232
-        "area": ['70,495,763,95'],
-        "cols": ['315,366,416,466,516,566,616,666,716'],
+    "232": {  # 232
+        "area": ["70,495,763,95"],
+        "cols": ["315,366,416,466,516,566,616,666,716"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
-    '233': {  # 233
-        "area": ['70,495,763,95'],
+    "233": {  # 233
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
-            3: ['3C Aggregate sources and Non-CO2 emissions'],
+            3: ["3C Aggregate sources and Non-CO2 emissions"],
         },
     },
-    '234': {  # 234
-        "area": ['70,495,763,95'],
+    "234": {  # 234
+        "area": ["70,495,763,95"],
         "rows_to_fix": {
-            3: ['International Water-borne Transport (International'],
+            3: ["International Water-borne Transport (International"],
         },
     },
-    '236': {  # 236
-        "area": ['70,495,763,95'],
-        "cols": ['298,344,392,439,487,534,580,629,675,721'],
+    "236": {  # 236
+        "area": ["70,495,763,95"],
+        "cols": ["298,344,392,439,487,534,580,629,675,721"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
-    '240': {  # 240
-        "area": ['70,495,763,95'],
-        "cols": ['283,329,372,416,459,504,550,594,639,682,726'],
+    "240": {  # 240
+        "area": ["70,495,763,95"],
+        "cols": ["283,329,372,416,459,504,550,594,639,682,726"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
     # HFCs
-    '243': {  # 243
-        "area": ['70,480,763,95'],
-        "cols": ['408,449,489,527,567,604,644,681,721'],
+    "243": {  # 243
+        "area": ["70,480,763,95"],
+        "cols": ["408,449,489,527,567,604,644,681,721"],
     },
-    '244': {  # 244
-        "area": ['70,495,763,95'],
-        "cols": ['408,449,489,527,567,604,644,681,721'],
+    "244": {  # 244
+        "area": ["70,495,763,95"],
+        "cols": ["408,449,489,527,567,604,644,681,721"],
     },
-    '245': {  # 245, 246
-        "area": ['70,495,763,95'],
-        "cols": ['405,442,478,515,550,587,621,657,693,729'],
+    "245": {  # 245, 246
+        "area": ["70,495,763,95"],
+        "cols": ["405,442,478,515,550,587,621,657,693,729"],
     },
-    '247': {  # 247, 248
-        "area": ['70,495,763,95'],
-        "cols": ['384,426,459,493,531,564,597,633,666,700,735'],
+    "247": {  # 247, 248
+        "area": ["70,495,763,95"],
+        "cols": ["384,426,459,493,531,564,597,633,666,700,735"],
     },
     # PFCs
-    '250': {  # 250
-        "area": ['70,495,763,95'],
-        "cols": ['341,389,436,485,531,579,626,674,723'],
+    "250": {  # 250
+        "area": ["70,495,763,95"],
+        "cols": ["341,389,436,485,531,579,626,674,723"],
     },
-    '252': {  # 252
-        "area": ['70,495,763,95'],
-        "cols": ['323,370,415,459,504,547,590,636,680,726'],
+    "252": {  # 252
+        "area": ["70,495,763,95"],
+        "cols": ["323,370,415,459,504,547,590,636,680,726"],
     },
-    '253': {  # 253
-        "area": ['70,495,763,95'],
-        "cols": ['334,378,419,464,511,554,597,636,668,702,735'],
+    "253": {  # 253
+        "area": ["70,495,763,95"],
+        "cols": ["334,378,419,464,511,554,597,636,668,702,735"],
     },
-    '254': {  # 254
-        "area": ['70,495,763,95'],
-        "cols": ['330,378,419,464,511,554,597,636,668,702,735'],
+    "254": {  # 254
+        "area": ["70,495,763,95"],
+        "cols": ["330,378,419,464,511,554,597,636,668,702,735"],
         "rows_to_fix": {
-            -3: ['2F Product Uses as Substitutes for Ozone Depleting Substances'],
+            -3: ["2F Product Uses as Substitutes for Ozone Depleting Substances"],
         },
     },
     # SF6
-    '256': {  # 256
-        "area": ['70,495,763,95'],
-        "cols": ['382,420,462,504,546,588,630,672,714'],
+    "256": {  # 256
+        "area": ["70,495,763,95"],
+        "cols": ["382,420,462,504,546,588,630,672,714"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
-    '258': {  # 258
-        "area": ['70,495,763,95'],
-        "cols": ['363,399,441,481,522,564,606,646,688,728'],
+    "258": {  # 258
+        "area": ["70,495,763,95"],
+        "cols": ["363,399,441,481,522,564,606,646,688,728"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
-    '260': {  # 260
-        "area": ['70,495,763,95'],
-        "cols": ['346,381,419,458,498,536,576,614,652,692,732'],
+    "260": {  # 260
+        "area": ["70,495,763,95"],
+        "cols": ["346,381,419,458,498,536,576,614,652,692,732"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
     # NF3
-    '261': {  # 261
-        "area": ['70,490,768,100'],
-        "cols": ['364,412,454,496,538,581,623,667,710'],
+    "261": {  # 261
+        "area": ["70,490,768,100"],
+        "cols": ["364,412,454,496,538,581,623,667,710"],
     },
-    '262': {  # 262
-        "area": ['70,495,763,95'],
-        "cols": ['376,420,462,504,545,591,633,676,718'],
+    "262": {  # 262
+        "area": ["70,495,763,95"],
+        "cols": ["376,420,462,504,545,591,633,676,718"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
-    '264': {  # 264
-        "area": ['70,495,763,95'],
-        "cols": ['370,415,451,491,530,569,609,651,689,729'],
+    "264": {  # 264
+        "area": ["70,495,763,95"],
+        "cols": ["370,415,451,491,530,569,609,651,689,729"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
-    '266': {  # 266
-        "area": ['70,495,763,95'],
-        "cols": ['355,392,430,467,505,544,580,619,656,695,732'],
+    "266": {  # 266
+        "area": ["70,495,763,95"],
+        "cols": ["355,392,430,467,505,544,580,619,656,695,732"],
         "rows_to_fix": {
-            3: ['2F Product Uses as Substitutes for Ozone Depleting'],
+            3: ["2F Product Uses as Substitutes for Ozone Depleting"],
         },
     },
 }
 
 table_defs = {
-    '203': {"template": '203', "entity": "CO2", "unit": "Gg CO2 / yr"},  # CO2
-    '204': {"template": '204', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '205': {"template": '205', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '206': {"template": '206', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '207': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '208': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '209': {"template": '205', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '210': {"template": '206', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '211': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '212': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '213': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '214': {"template": '205', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '215': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '216': {"template": '216', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '217': {"template": '207', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '218': {"template": '205', "entity": "CO2", "unit": "Gg CO2 / yr"},
-    '219': {"template": '219', "entity": "CH4", "unit": "Gg CH4 / yr"},  # CH4
-    '220': {"template": '220', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '221': {"template": '221', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '222': {"template": '222', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '223': {"template": '207', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '224': {"template": '220', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '225': {"template": '225', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '226': {"template": '226', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '227': {"template": '207', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '228': {"template": '220', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '229': {"template": '229', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '230': {"template": '226', "entity": "CH4", "unit": "Gg CH4 / yr"},
-    '231': {"template": '207', "entity": "N2O", "unit": "Gg N2O / yr"},  # N2O
-    '232': {"template": '232', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '233': {"template": '233', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '234': {"template": '234', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '235': {"template": '207', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '236': {"template": '236', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '237': {"template": '233', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '238': {"template": '234', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '239': {"template": '207', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '240': {"template": '240', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '241': {"template": '233', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '242': {"template": '234', "entity": "N2O", "unit": "Gg N2O / yr"},
-    '243': {"template": '243', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},  # HFCs
-    '244': {"template": '244', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '245': {"template": '245', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '246': {"template": '245', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '247': {"template": '247', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '248': {"template": '247', "entity": f"HFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '249': {"template": '203', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},  # PFCs
-    '250': {"template": '250', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '251': {"template": '207', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '252': {"template": '252', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '253': {"template": '253', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '254': {"template": '254', "entity": f"PFCS ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '255': {"template": '219', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},  # SF6
-    '256': {"template": '256', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '257': {"template": '207', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '258': {"template": '258', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '259': {"template": '207', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '260': {"template": '260', "entity": f"SF6 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '261': {"template": '261', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},  # NF3
-    '262': {"template": '262', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '263': {"template": '207', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '264': {"template": '264', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '265': {"template": '207', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
-    '266': {"template": '266', "entity": f"NF3 ({gwp_to_use})",
-            "unit": "Gg CO2 / yr"},
+    "203": {"template": "203", "entity": "CO2", "unit": "Gg CO2 / yr"},  # CO2
+    "204": {"template": "204", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "205": {"template": "205", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "206": {"template": "206", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "207": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "208": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "209": {"template": "205", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "210": {"template": "206", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "211": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "212": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "213": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "214": {"template": "205", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "215": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "216": {"template": "216", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "217": {"template": "207", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "218": {"template": "205", "entity": "CO2", "unit": "Gg CO2 / yr"},
+    "219": {"template": "219", "entity": "CH4", "unit": "Gg CH4 / yr"},  # CH4
+    "220": {"template": "220", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "221": {"template": "221", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "222": {"template": "222", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "223": {"template": "207", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "224": {"template": "220", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "225": {"template": "225", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "226": {"template": "226", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "227": {"template": "207", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "228": {"template": "220", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "229": {"template": "229", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "230": {"template": "226", "entity": "CH4", "unit": "Gg CH4 / yr"},
+    "231": {"template": "207", "entity": "N2O", "unit": "Gg N2O / yr"},  # N2O
+    "232": {"template": "232", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "233": {"template": "233", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "234": {"template": "234", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "235": {"template": "207", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "236": {"template": "236", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "237": {"template": "233", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "238": {"template": "234", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "239": {"template": "207", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "240": {"template": "240", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "241": {"template": "233", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "242": {"template": "234", "entity": "N2O", "unit": "Gg N2O / yr"},
+    "243": {
+        "template": "243",
+        "entity": f"HFCS ({gwp_to_use})",
+        "unit": "Gg CO2 / yr",
+    },  # HFCs
+    "244": {"template": "244", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "245": {"template": "245", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "246": {"template": "245", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "247": {"template": "247", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "248": {"template": "247", "entity": f"HFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "249": {
+        "template": "203",
+        "entity": f"PFCS ({gwp_to_use})",
+        "unit": "Gg CO2 / yr",
+    },  # PFCs
+    "250": {"template": "250", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "251": {"template": "207", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "252": {"template": "252", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "253": {"template": "253", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "254": {"template": "254", "entity": f"PFCS ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "255": {
+        "template": "219",
+        "entity": f"SF6 ({gwp_to_use})",
+        "unit": "Gg CO2 / yr",
+    },  # SF6
+    "256": {"template": "256", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "257": {"template": "207", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "258": {"template": "258", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "259": {"template": "207", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "260": {"template": "260", "entity": f"SF6 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "261": {
+        "template": "261",
+        "entity": f"NF3 ({gwp_to_use})",
+        "unit": "Gg CO2 / yr",
+    },  # NF3
+    "262": {"template": "262", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "263": {"template": "207", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "264": {"template": "264", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "265": {"template": "207", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
+    "266": {"template": "266", "entity": f"NF3 ({gwp_to_use})", "unit": "Gg CO2 / yr"},
 }
 
 country_processing_step1 = {
-    'aggregate_cats': {
-        'M.3.C.AG': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5',
-                                 '3.C.6', '3.C.7', '3.C.8'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land '
-                             '(Agriculture)'},
-        'M.3.D.AG': {'sources': ['3.D.2'],
-                     'name': 'Other (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG', 'M.3.D.AG'],
-                     'name': 'Agriculture excluding livestock'},
-        'M.AG': {'sources': ['3.A', 'M.AG.ELV'],
-                     'name': 'Agriculture'},
-        'M.3.D.LU': {'sources': ['3.D.1'],
-                     'name': 'Other (LULUCF)'},
-        'M.LULUCF': {'sources': ['3.B', 'M.3.D.LU'],
-                     'name': 'LULUCF'},
-        'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'],
-                     'name': 'National total emissions excluding LULUCF'},
-    },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "aggregate_cats": {
+        "M.3.C.AG": {
+            "sources": [
+                "3.C.1",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+                "3.C.8",
+            ],
+            "name": "Aggregate sources and non-CO2 emissions sources on land "
+            "(Agriculture)",
+        },
+        "M.3.D.AG": {"sources": ["3.D.2"], "name": "Other (Agriculture)"},
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG", "M.3.D.AG"],
+            "name": "Agriculture excluding livestock",
+        },
+        "M.AG": {"sources": ["3.A", "M.AG.ELV"], "name": "Agriculture"},
+        "M.3.D.LU": {"sources": ["3.D.1"], "name": "Other (LULUCF)"},
+        "M.LULUCF": {"sources": ["3.B", "M.3.D.LU"], "name": "LULUCF"},
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4", "5"],
+            "name": "National total emissions excluding LULUCF",
+        },
+    },
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
     },
 }
 
 gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
 }

+ 82 - 52
src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR3_from_pdf.py

@@ -1,8 +1,15 @@
-# this script reads data from Malaysia's BUR3
+"""
+Read Malaysia's BUR3 from pdf
+
+This script reads data from Malaysia's BUR3
+Data are read from pdf using camelot
+
+"""
+
 
 import camelot
 import primap2 as pm2
-from .config_mys_bur3 import (
+from config_mys_bur3 import (
     cat_code_regexp,
     cat_codes_manual,
     cat_names_fix,
@@ -33,8 +40,8 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Malaysia' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Malaysia'
+    input_folder = downloaded_data_path / "UNFCCC" / "Malaysia" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Malaysia"
     if not output_folder.exists():
         output_folder.mkdir()
 
@@ -47,7 +54,7 @@ if __name__ == "__main__":
     # SF6: 234 - 237
     # NF3: 238 - 241
 
-    output_filename = 'MYS_BUR3_2020_'
+    output_filename = "MYS_BUR3_2020_"
     compression = dict(zlib=True, complevel=9)
 
     # ###
@@ -62,36 +69,44 @@ if __name__ == "__main__":
         area = table_def_templates[page_template_nr]["area"]
         if "cols" in table_def_templates[page_template_nr].keys():
             cols = table_def_templates[page_template_nr]["cols"]
-            tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page), \
-                                      flavor='stream', table_areas=area, columns=cols,
-                                      split_text=True)
+            tables = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+                columns=cols,
+                split_text=True,
+            )
         else:
-            tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page), \
-                                      flavor='stream', table_areas=area)
+            tables = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+            )
 
         df_current = tables[0].df.copy()
-        df_current.iloc[0,0] = 'Categories'
+        df_current.iloc[0, 0] = "Categories"
         df_current.columns = df_current.iloc[0]
         df_current = df_current.drop(0)
         # replace double \n
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("\n", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
         # replace double and triple spaces
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("   ", " ")
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("  ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")
 
         # fix the split rows
         if "rows_to_fix" in table_def_templates[page_template_nr].keys():
             for n_rows in table_def_templates[page_template_nr]["rows_to_fix"].keys():
-                df_current = fix_rows(df_current,
-                                      table_def_templates[page_template_nr]["rows_to_fix"][
-                                          n_rows], index_cols[0], n_rows)
+                df_current = fix_rows(
+                    df_current,
+                    table_def_templates[page_template_nr]["rows_to_fix"][n_rows],
+                    index_cols[0],
+                    n_rows,
+                )
 
         # replace category names with typos
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].replace(cat_names_fix)
+        df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)
 
         # replace empty stings
         df_current = df_current.replace(values_replacement)
@@ -106,7 +121,7 @@ if __name__ == "__main__":
         for col in cols_for_space_stripping:
             df_current[col] = df_current[col].str.strip()
 
-        # print(df_current.columns.values)
+        # print(df_current.columns.to_numpy())
 
         # aggregate dfs
         if df_all is None:
@@ -118,10 +133,11 @@ if __name__ == "__main__":
             cols_both = list(set(cols_all).intersection(set(cols_current)))
             # print(cols_both)
             if len(cols_both) > 0:
-                df_all = df_all.merge(df_current, how='outer', on=cols_both,
-                                      suffixes=(None, None))
+                df_all = df_all.merge(
+                    df_current, how="outer", on=cols_both, suffixes=(None, None)
+                )
             else:
-                df_all = df_all.merge(df_current, how='outer', suffixes=(None, None))
+                df_all = df_all.merge(df_current, how="outer", suffixes=(None, None))
             df_all = df_all.groupby(index_cols).first().reset_index()
             # df_all = df_all.join(df_current, how='outer')
 
@@ -137,28 +153,38 @@ if __name__ == "__main__":
     # replace cat names by codes in col "Categories"
     # first the manual replacements
     df_all["Categories"] = df_all["Categories"].replace(cat_codes_manual)
+
+    # then the regex replacements
-    def repl(m):
-        return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-    df_all["Categories"] = df_all["Categories"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+    df_all["Categories"] = df_all["Categories"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
 
     # make sure all col headers are str
     df_all.columns = df_all.columns.map(str)
 
     # remove thousands separators as pd.to_numeric can't deal with that
     # also replace None with NaN
-    year_cols = list(set(df_all.columns) - set(['Categories', 'entity', 'unit', 'orig_cat_name']))
+    year_cols = list(
+        set(df_all.columns) - set(["Categories", "entity", "unit", "orig_cat_name"])
+    )
     for col in year_cols:
         df_all.loc[:, col] = df_all.loc[:, col].str.strip()
-        def repl(m):
-            return m.group('part1') + m.group('part2')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-        df_all[col][df_all[col].isnull()] = 'NaN'
+
+        def repl(m):  # noqa: D103
+            return m.group("part1") + m.group("part2")
+
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace(
+            "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+        )
+        df_all[col][df_all[col].isna()] = "NaN"
         # manually map code NENO to nan
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('NENO','NaN')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('O NANaN','NaN')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('IE NO','0')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('IE NA NO I','0')
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("NENO", "NaN")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("O NANaN", "NaN")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("IE NO", "0")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("IE NA NO I", "0")
         # TODO: add code to PRIMAP2
 
     # drop orig_cat_name as it's non-unique per category
@@ -167,17 +193,17 @@ if __name__ == "__main__":
     data_if = pm2.pm2io.convert_wide_dataframe_if(
         df_all,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
-        #coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )
 
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
 
@@ -190,12 +216,15 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_if)
+        data_if,
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
     # ###
     # ## process the data
@@ -211,9 +240,9 @@ if __name__ == "__main__":
     )
 
     # adapt source and metadata
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
     # ###
     # save data to IF and native format
@@ -222,9 +251,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 84 - 55
src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR4_from_pdf.py

@@ -1,10 +1,16 @@
-# this script reads data from Malaysia's BUR4
-# code ist mostly identical to BUR3
+"""
+Read Malaysia's BUR4 from pdf
+
+This script reads data from Malaysia's BUR4
+Data are read from pdf using camelot
+
+Code is mostly identical to BUR3
+"""
 
 
 import camelot
 import primap2 as pm2
-from .config_mys_bur4 import (
+from config_mys_bur4 import (
     cat_code_regexp,
     cat_codes_manual,
     cat_names_fix,
@@ -35,8 +41,8 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Malaysia' / 'BUR4'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Malaysia'
+    input_folder = downloaded_data_path / "UNFCCC" / "Malaysia" / "BUR4"
+    output_folder = extracted_data_path / "UNFCCC" / "Malaysia"
     if not output_folder.exists():
         output_folder.mkdir()
 
@@ -50,7 +56,7 @@ if __name__ == "__main__":
     # SF6: 255 - 260
     # NF3: 261 - 266
 
-    output_filename = 'MYS_BUR4_2022_'
+    output_filename = "MYS_BUR4_2022_"
     compression = dict(zlib=True, complevel=9)
 
     # ###
@@ -65,36 +71,44 @@ if __name__ == "__main__":
         area = table_def_templates[page_template_nr]["area"]
         if "cols" in table_def_templates[page_template_nr].keys():
             cols = table_def_templates[page_template_nr]["cols"]
-            tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page), \
-                                      flavor='stream', table_areas=area, columns=cols,
-                                      split_text=True)
+            tables = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+                columns=cols,
+                split_text=True,
+            )
         else:
-            tables = camelot.read_pdf(str(input_folder / pdf_file), pages=str(page), \
-                                      flavor='stream', table_areas=area)
+            tables = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+            )
 
         df_current = tables[0].df.copy()
-        df_current.iloc[0,0] = 'Categories'
+        df_current.iloc[0, 0] = "Categories"
         df_current.columns = df_current.iloc[0]
         df_current = df_current.drop(0)
         # replace double \n
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("\n", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
         # replace double and triple spaces
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("   ", " ")
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].str.replace("  ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+        df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")
 
         # fix the split rows
         if "rows_to_fix" in table_def_templates[page_template_nr].keys():
             for n_rows in table_def_templates[page_template_nr]["rows_to_fix"].keys():
-                df_current = fix_rows(df_current,
-                                      table_def_templates[page_template_nr]["rows_to_fix"][
-                                          n_rows], index_cols[0], n_rows)
+                df_current = fix_rows(
+                    df_current,
+                    table_def_templates[page_template_nr]["rows_to_fix"][n_rows],
+                    index_cols[0],
+                    n_rows,
+                )
 
         # replace category names with typos
-        df_current[index_cols[0]] = \
-            df_current[index_cols[0]].replace(cat_names_fix)
+        df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)
 
         # replace empty stings
         df_current = df_current.replace(values_replacement)
@@ -109,22 +123,23 @@ if __name__ == "__main__":
         for col in cols_for_space_stripping:
             df_current[col] = df_current[col].str.strip()
 
-        # print(df_current.columns.values)
+        # print(df_current.columns.to_numpy())
 
         # aggregate dfs
         if df_all is None:
             df_all = df_current
         else:
             # find intersecting cols
-            cols_all = df_all.columns.values
-            cols_current = df_current.columns.values
+            cols_all = df_all.columns.to_numpy()
+            cols_current = df_current.columns.to_numpy()
             cols_both = list(set(cols_all).intersection(set(cols_current)))
             # print(cols_both)
             if len(cols_both) > 0:
-                df_all = df_all.merge(df_current, how='outer', on=cols_both,
-                                      suffixes=(None, None))
+                df_all = df_all.merge(
+                    df_current, how="outer", on=cols_both, suffixes=(None, None)
+                )
             else:
-                df_all = df_all.merge(df_current, how='outer', suffixes=(None, None))
+                df_all = df_all.merge(df_current, how="outer", suffixes=(None, None))
             df_all = df_all.groupby(index_cols).first().reset_index()
             # df_all = df_all.join(df_current, how='outer')
 
@@ -140,28 +155,38 @@ if __name__ == "__main__":
     # replace cat names by codes in col "Categories"
     # first the manual replacements
     df_all["Categories"] = df_all["Categories"].replace(cat_codes_manual)
+
+    # then the regex replacements
-    def repl(m):
-        return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-    df_all["Categories"] = df_all["Categories"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+    df_all["Categories"] = df_all["Categories"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
 
     # make sure all col headers are str
     df_all.columns = df_all.columns.map(str)
 
     # remove thousands separators as pd.to_numeric can't deal with that
     # also replace None with NaN
-    year_cols = list(set(df_all.columns) - set(['Categories', 'entity', 'unit', 'orig_cat_name']))
+    year_cols = list(
+        set(df_all.columns) - set(["Categories", "entity", "unit", "orig_cat_name"])
+    )
     for col in year_cols:
         df_all.loc[:, col] = df_all.loc[:, col].str.strip()
-        def repl(m):
-            return m.group('part1') + m.group('part2')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-        df_all[col][df_all[col].isnull()] = 'NaN'
+
+        def repl(m):  # noqa: D103
+            return m.group("part1") + m.group("part2")
+
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace(
+            "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+        )
+        df_all[col][df_all[col].isna()] = "NaN"
         # manually map code NENO to nan
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('NENO','NaN')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('O NANaN','NaN')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('IE NO','0')
-        df_all.loc[:, col] = df_all.loc[:, col].str.replace('IE NA NO I','0')
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("NENO", "NaN")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("O NANaN", "NaN")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("IE NO", "0")
+        df_all.loc[:, col] = df_all.loc[:, col].str.replace("IE NA NO I", "0")
         # TODO: add code to PRIMAP2
 
     # drop orig_cat_name as it's non-unique per category
@@ -170,17 +195,17 @@ if __name__ == "__main__":
     data_if = pm2.pm2io.convert_wide_dataframe_if(
         df_all,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
-        #coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )
 
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
 
@@ -193,12 +218,15 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_if)
+        data_if,
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
     # ###
     # ## process the data
@@ -214,9 +242,9 @@ if __name__ == "__main__":
     )
 
     # adapt source and metadata
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
     # ###
     # save data to IF and native format
@@ -225,9 +253,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Mexico/__init__.py

@@ -0,0 +1,30 @@
+"""Read Mexico's BURs, NIRs, NCs
+
+Scripts and configurations to read Mexico's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'MEX'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=MEX
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 81 - 35
src/unfccc_ghg_data/unfccc_reader/Mexico/config_mex_bur3.py

@@ -1,8 +1,42 @@
+"""Config for Mexico's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 import pandas as pd
 
 
-def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str,
-             n_rows: int) -> pd.DataFrame:
+def fix_rows(
+    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
+) -> pd.DataFrame:
+    """
+    Combine split rows
+
+    This function combines rows which have been split into several rows during data
+    reading from pdf because they contained line breaks.
+
+    Parameters
+    ----------
+    data: pd.DataFrame
+        The data to work with
+    rows_to_fix: list
+        List of values for which to fix rows
+    col_to_use: str
+        column to use to find the rows to merge
+    n_rows: int
+        How many rows to combine for each row found. e.g. 3 means combine the found
+        row with the following two rows. Negative values are used for more
+        complicated situations where the rows to merge are also before the position
+        of the value that indicates the merge. See code for details
+
+    Returns
+    -------
+        pandas DataFrame with combined rows. The individual rows are removed
+
+    TODO: move function to helper module (make sure to have one function that works
+     for all cases)
+    """
     for row in rows_to_fix:
         # print(row)
         # find the row number and collect the row and the next two rows
@@ -16,29 +50,29 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str,
         for item in index:
             loc = data.index.get_loc(item)
             ####print(data[col_to_use].loc[loc + 1])
-            if n_rows == -2:
+            if n_rows == -2:  # noqa: PLR2004
                 locs_to_merge = list(range(loc - 1, loc + 1))
                 loc_to_check = loc - 1
-            if n_rows == -6:
+            elif n_rows == -6:  # noqa: PLR2004
                 locs_to_merge = list(range(loc - 3, loc + 3))
                 loc_to_check = loc - 3
-            elif n_rows == -3:
+            elif n_rows == -3:  # noqa: PLR2004
                 locs_to_merge = list(range(loc - 1, loc + 2))
                 loc_to_check = loc - 1
             else:
                 locs_to_merge = list(range(loc, loc + n_rows))
                 loc_to_check = loc + 1
 
-            if (data[col_to_use].loc[loc_to_check] == '') or n_rows == 2:
+            if (not data[col_to_use].loc[loc_to_check]) or n_rows == 2:  # noqa: PLR2004
                 rows_to_merge = data.iloc[locs_to_merge]
                 indices_to_merge = rows_to_merge.index
                 # replace numerical NaN values
                 ####print(rows_to_merge)
-                rows_to_merge = rows_to_merge.fillna('')
+                rows_to_merge = rows_to_merge.fillna("")
                 ####print("fillna")
                 ####print(rows_to_merge)
                 # join the three rows
-                new_row = rows_to_merge.agg(' '.join)
+                new_row = rows_to_merge.agg(" ".join)
                 # replace the double spaces that are created
                 # must be done here and not at the end as splits are not always
                 # the same and join would produce different col values
@@ -54,67 +88,77 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str,
         data = data.reset_index(drop=True)
     return data
 
+
 page_defs = {
-    '118': {
+    "118": {
         "camelot": {
-            "table_areas": ['49,602,551,73'],
-            "columns": ['223,277,314,348,392,422,446,483'],
+            "table_areas": ["49,602,551,73"],
+            "columns": ["223,277,314,348,392,422,446,483"],
             "split_text": False,
             "flavor": "stream",
         },
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
-            3: ["Todas las emisiones y las absorciones",
+            3: [
+                "Todas las emisiones y las absorciones",
                 "Todas las emisiones (sin [3B] Tierra ni",
                 "[1A] Actividades de quema del",
                 "[1A2] Industrias manufactura y de la",
                 "[1B] Emisiones fugitivas provenientes de",
-                "[2] Procesos industriales y uso de"],
+                "[2] Procesos industriales y uso de",
+            ],
         },
     },
-    '119': {
+    "119": {
         "camelot": {
-            "table_areas": ['49,650,551,77'],
-            "columns": ['228,275,317,352,394,421,446,483'],
+            "table_areas": ["49,650,551,77"],
+            "columns": ["228,275,317,352,394,421,446,483"],
             "split_text": True,
             "flavor": "stream",
         },
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
-            3: ["[2B4] Producción de caprolactama,",
+            3: [
+                "[2B4] Producción de caprolactama,",
                 "[2B8] Producción petroquímica y negro",
                 "[2D] Uso de productos no energéticos de",
-                "[2E1] Circuitos integrados o"],
+                "[2E1] Circuitos integrados o",
+            ],
         },
     },
-    '120': {
+    "120": {
         "camelot": {
-            "table_areas": ['49,650,551,77'],
-            "columns": ['223,277,314,348,392,422,446,483'],
+            "table_areas": ["49,650,551,77"],
+            "columns": ["223,277,314,348,392,422,446,483"],
             "split_text": False,
             "flavor": "stream",
         },
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
             -3: ["[3B] Tierra"],
-            3: ["[2F] Uso de productos sustitutos de las",
+            3: [
+                "[2F] Uso de productos sustitutos de las",
                 "[2G] Manufactura y utilización de otros",
-                "[3] Agricultura, silvicultura y otros usos"],
-            2: ["[2H2] Industria de la alimentación y las",
-                "[2G2] SF₆ y PFC de otros usos de"],
+                "[3] Agricultura, silvicultura y otros usos",
+            ],
+            2: [
+                "[2H2] Industria de la alimentación y las",
+                "[2G2] SF₆ y PFC de otros usos de",
+            ],
         },
     },
-    '121': {
+    "121": {
         "camelot": {
-            "table_areas": ['49,650,551,70'],
-            "columns": ['223,277,314,348,392,422,446,483'],
+            "table_areas": ["49,650,551,70"],
+            "columns": ["223,277,314,348,392,422,446,483"],
             "split_text": False,
             "flavor": "stream",
         },
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
             -3: ["[3B1] Tierra forestales"],
-            3: ["[3C] Fuentes agregadas y fuentes de",
+            3: [
+                "[3C] Fuentes agregadas y fuentes de",
                 "[3C1] Emisiones de GEI por quemado de",
                 "[3C4] Emisiones directas de los N₂O de",
                 "[3C5] Emisiones indirectas de los N₂O de",
@@ -123,24 +167,26 @@ page_defs = {
                 "[4A2] Sitios no controlados de",
                 "[4A3] Tiraderos a cielo abierto para",
                 "[4B] Tratamiento biológico de los",
-                ],
+            ],
         },
     },
-    '122': {
+    "122": {
         "camelot": {
-            "table_areas": ['49,650,551,404'],
-            "columns": ['223,277,314,348,392,422,446,483'],
+            "table_areas": ["49,650,551,404"],
+            "columns": ["223,277,314,348,392,422,446,483"],
             "split_text": False,
             "flavor": "stream",
         },
         "rows_to_fix": {
             -6: ["Categorías de fuentes y"],
-            3: ["[4C] Incineración y quema a cielo abierto",
+            3: [
+                "[4C] Incineración y quema a cielo abierto",
                 "[4C1] Incineración de residuos peligrosos",
                 "[4C2] Quema a cielo abierto de residuos",
                 "[4D] Tratamiento y eliminación de aguas",
                 "[4D1] Tratamiento y eliminación de",
-                "[4D2] Tratamiento y eliminación de"],
+                "[4D2] Tratamiento y eliminación de",
+            ],
         },
     },
 }

+ 63 - 66
src/unfccc_ghg_data/unfccc_reader/Mexico/read_MEX_BUR3_from_pdf.py

@@ -1,10 +1,15 @@
-# this script reads data from Mexico's BUR3
-# Data is read from the pdf file
+"""
+Read Mexico's BUR3 from pdf
+
+This script reads data from Mexico's BUR3
+Data are read from pdf using camelot
+
+"""
 
 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_mex_bur3 import fix_rows, page_defs
+from config_mex_bur3 import fix_rows, page_defs
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
@@ -12,16 +17,16 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Mexico' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Mexico'
+    input_folder = downloaded_data_path / "UNFCCC" / "Mexico" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Mexico"
     if not output_folder.exists():
-       output_folder.mkdir()
+        output_folder.mkdir()
 
-    output_filename = 'MEX_BUR3_2022_'
+    output_filename = "MEX_BUR3_2022_"
     compression = dict(zlib=True, complevel=9)
-    inventory_file = 'Mexico_3er_BUR.pdf'
+    inventory_file = "Mexico_3er_BUR.pdf"
 
-    gwp_to_use = 'AR5GWP100'
+    gwp_to_use = "AR5GWP100"
     year = 2019
     entity_row = 0
     unit_row = 1
@@ -43,12 +48,12 @@ if __name__ == "__main__":
 
     # manual category codes
     cat_codes_manual = {
-        'Todas las emisiones y las absorciones nacionales': '0',
-        'Todas las emisiones (sin [3B] Tierra ni [3D1] Productos de madera recolectada': 'M0EL',
-        '2F6 Otras aplicaciones': '2F6',
+        "Todas las emisiones y las absorciones nacionales": "0",
+        "Todas las emisiones (sin [3B] Tierra ni [3D1] Productos de madera recolectada": "M0EL",
+        "2F6 Otras aplicaciones": "2F6",
     }
 
-    cat_code_regexp = r'^\[(?P<code>[a-zA-Z0-9]{1,3})\].*'
+    cat_code_regexp = r"^\[(?P<code>[a-zA-Z0-9]{1,3})\].*"
 
     coords_cols = {
         "category": "category",
@@ -77,18 +82,17 @@ if __name__ == "__main__":
         "unit": "PRIMAP1",
         "category": "PRIMAP1",
         "entity": {
-            'CH₄': 'CH4',
-            'CO₂': 'CO2',
-            'EMISIONES NETAS PCG AR5': 'KYOTOGHG (AR5GWP100)',
-            'HFC': f"HFCS ({gwp_to_use})",
-            'NF₃': f"NF3 ({gwp_to_use})",
-            'N₂O': 'N2O',
-            'PFC': f"PFCS ({gwp_to_use})",
-            'SF₆': f"SF6 ({gwp_to_use})",
+            "CH₄": "CH4",
+            "CO₂": "CO2",
+            "EMISIONES NETAS PCG AR5": "KYOTOGHG (AR5GWP100)",
+            "HFC": f"HFCS ({gwp_to_use})",
+            "NF₃": f"NF3 ({gwp_to_use})",
+            "N₂O": "N2O",
+            "PFC": f"PFCS ({gwp_to_use})",
+            "SF₆": f"SF6 ({gwp_to_use})",
         },
     }
 
-
     filter_remove = {}
 
     filter_keep = {}
@@ -102,11 +106,6 @@ if __name__ == "__main__":
         "institution": "UNFCCC",
     }
 
-    # convert to mass units where possible
-    entities_to_convert_to_mass = [
-        'NF3', 'SF6'
-    ]
-
     # ###
     # read the data from pdf into one long format dataframe
     # ###
@@ -114,8 +113,9 @@ if __name__ == "__main__":
     for page in page_defs.keys():
         print(f"Working on page {page}")
         page_def = page_defs[page]
-        tables = camelot.read_pdf(str(input_folder / inventory_file), pages=page,
-                                  **page_def["camelot"])
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file), pages=page, **page_def["camelot"]
+        )
         df_this_table = tables[0].df
 
         # fix rows
@@ -127,31 +127,36 @@ if __name__ == "__main__":
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("-", "-")
             # replace double space in entity
             df_this_table.iloc[0, :] = df_this_table.iloc[0, :].str.replace("  ", " ")
-            df_this_table = fix_rows(df_this_table, page_def["rows_to_fix"][n_rows], 0,
-                                     n_rows)
+            df_this_table = fix_rows(
+                df_this_table, page_def["rows_to_fix"][n_rows], 0, n_rows
+            )
 
         # add units
-        for col in df_this_table.columns.values:
+        for col in df_this_table.columns.to_numpy():
             if df_this_table[col].iloc[0] in units.keys():
                 df_this_table[col].iloc[1] = units[df_this_table[col].iloc[0]]
 
         # bring in right format for conversion to long format
-        df_this_table = pm2.pm2io.nir_add_unit_information(df_this_table, unit_row=unit_row,
-                                                           entity_row=entity_row,
-                                                           regexp_unit=".*",
-                                                           regexp_entity=".*",
-                                                           default_unit="GgCO2eq")
+        df_this_table = pm2.pm2io.nir_add_unit_information(
+            df_this_table,
+            unit_row=unit_row,
+            entity_row=entity_row,
+            regexp_unit=".*",
+            regexp_entity=".*",
+            default_unit="GgCO2eq",
+        )
 
         # set index and convert to long format
         df_this_table = df_this_table.set_index(index_cols)
-        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(df_this_table, year,
-                                                              header_long)
+        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(
+            df_this_table, year, header_long
+        )
 
         # combine with tables for other sectors (merge not append)
         if df_all is None:
             df_all = df_this_table_long
         else:
-            df_all = pd.concat([df_all, df_this_table_long], axis=0, join='outer')
+            df_all = pd.concat([df_all, df_this_table_long], axis=0, join="outer")
 
     # ###
     # conversion to PM2 IF
@@ -162,15 +167,19 @@ if __name__ == "__main__":
     # replace cat names by codes in col "category"
     # first the manual replacements
     df_all["category"] = df_all["category"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-       return m.group('code')
-    df_all["category"] = df_all["category"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_all["category"] = df_all["category"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
     df_all = df_all.reset_index(drop=True)
 
     # replace "," and " " with "" in data
-    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(',','', regex=False)
-    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(' ','', regex=False)
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(",", "", regex=False)
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(" ", "", regex=False)
 
     # make sure all col headers are str
     df_all.columns = df_all.columns.map(str)
@@ -185,12 +194,13 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
-        convert_str=True
-        )
+        convert_str=True,
+        time_format="%Y",
+    )
 
     cat_label = "category (IPCC2006)"
     # fix error cats
@@ -198,21 +208,6 @@ if __name__ == "__main__":
 
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
 
-    # convert to mass units from CO2eq
-
-    entities_to_convert = [f"{entity} ({gwp_to_use})" for entity in
-                           entities_to_convert_to_mass]
-
-    for entity in entities_to_convert:
-        converted = data_pm2[entity].pr.convert_to_mass()
-        basic_entity = entity.split(" ")[0]
-        converted = converted.to_dataset(name=basic_entity)
-        data_pm2 = data_pm2.pr.merge(converted)
-        data_pm2[basic_entity].attrs["entity"] = basic_entity
-
-    # drop the GWP data
-    data_pm2 = data_pm2.drop_vars(entities_to_convert)
-
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()
 
@@ -222,9 +217,11 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"]), data_if)
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
         output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
-        encoding=encoding)
+        encoding=encoding,
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Montenegro/__init__.py

@@ -0,0 +1,30 @@
+"""Read Montenegro's BURs, NIRs, NCs
+
+Scripts and configurations to read Montenegro's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'MNE'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=MNE
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 103 - 47
src/unfccc_ghg_data/unfccc_reader/Montenegro/config_mne_bur3.py

@@ -1,67 +1,123 @@
+"""Config for Montenegro's BUR3
+
+Partial configuration for camelot and data aggregation. PRIMAP2 conversion
+config and metadata are defined in the reading script
+
+"""
+
 # most time series are contained twice and 2005 data is also contained twice. Some
 # data is inconsistent and we remove the time series with errors
 drop_data = {
-    2: { # individual sector time series are (mostly) wrong, leave only 0.EL timeseries
-        "cats": ["1", "1.A", "1.A.1", "1.A.1", "1.A.2", "1.A.3", "1.A.4", "1.A.5", "1.B", "1.B.1", "1.B.2",
-                 "2", "2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H",
-                 "3", "3.A", "3.B"],
-        #"years": ["2005"], # 2005 data copy of 2019
+    2: {  # individual sector time series are (mostly) wrong, leave only 0.EL timeseries
+        "cats": [
+            "1",
+            "1.A",
+            "1.A.1",
+            "1.A.1",
+            "1.A.2",
+            "1.A.3",
+            "1.A.4",
+            "1.A.5",
+            "1.B",
+            "1.B.1",
+            "1.B.2",
+            "2",
+            "2.A",
+            "2.B",
+            "2.C",
+            "2.D",
+            "2.E",
+            "2.F",
+            "2.G",
+            "2.H",
+            "3",
+            "3.A",
+            "3.B",
+        ],
+        # "years": ["2005"], # 2005 data copy of 2019
     },
-    3: { # individual sector time series are (mostly) wrong, leave only 0.EL timeseries
-        "cats": ["3.C", "3.D", "3.E", "3.F", "3.G", "5", "5.A", "5.B", "5.C", "5.D", "6"]
-        #"years": ["2005"],
+    3: {  # individual sector time series are (mostly) wrong, leave only 0.EL timeseries
+        "cats": [
+            "3.C",
+            "3.D",
+            "3.E",
+            "3.F",
+            "3.G",
+            "5",
+            "5.A",
+            "5.B",
+            "5.C",
+            "5.D",
+            "6",
+        ]
+        # "years": ["2005"],
     },
-    6: { #2005 data copy of 2019
+    6: {  # 2005 data copy of 2019
         "years": ["2005"],
     },
-    7: { # 2005 data copy of 2019 for 3.G
+    7: {  # 2005 data copy of 2019 for 3.G
         "years": ["2005"],
     },
-    25: { # 2005 data copy of 2019 (CO2, 2005-2019, first table)
+    25: {  # 2005 data copy of 2019 (CO2, 2005-2019, first table)
         "years": ["2005"],
     },
-    26: { # 2005 data copy of 2019 (CO2, 2005-2019, second table)
+    26: {  # 2005 data copy of 2019 (CO2, 2005-2019, second table)
         "years": ["2005"],
     },
 }
 
 cat_mapping = {
-    '3': 'M.AG',
-    '3.A': '3.A.1',
-    '3.B': '3.A.2',
-    '3.C': '3.C.7', # rice
-    '3.D': 'M.3.C.45AG', # Agricultural soils
-    '3.E': '3.C.1.c', # prescribed burning of savanna
-    '3.F': '3.C.1.b', # field burning of agricultural residues
-    '3.G': '3.C.3', # urea application
-    '4': 'M.LULUCF',
-    '4.A': '3.B.1', # forest
-    '4.B': '3.B.2', # cropland
-    '4.C': '3.B.3', # grassland
-    '4.D': '3.B.4', # wetland
-    '4.E': '3.B.5', # Settlements
-    '4.F': '3.B.6', # other land
-    '4.G': '3.D.1', # HWP
-    '5': '4',
-    '5.A': '4.A',
-    '5.B': '4.B',
-    '5.C': '4.C',
-    '5.D': '4.D',
-    '6': '5',
+    "3": "M.AG",
+    "3.A": "3.A.1",
+    "3.B": "3.A.2",
+    "3.C": "3.C.7",  # rice
+    "3.D": "M.3.C.45AG",  # Agricultural soils
+    "3.E": "3.C.1.c",  # prescribed burning of savanna
+    "3.F": "3.C.1.b",  # field burning of agricultural residues
+    "3.G": "3.C.3",  # urea application
+    "4": "M.LULUCF",
+    "4.A": "3.B.1",  # forest
+    "4.B": "3.B.2",  # cropland
+    "4.C": "3.B.3",  # grassland
+    "4.D": "3.B.4",  # wetland
+    "4.E": "3.B.5",  # Settlements
+    "4.F": "3.B.6",  # other land
+    "4.G": "3.D.1",  # HWP
+    "5": "4",
+    "5.A": "4.A",
+    "5.B": "4.B",
+    "5.C": "4.C",
+    "5.D": "4.D",
+    "6": "5",
 }
 
 aggregate_cats = {
-    '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-    '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.3', '3.B.4', '3.B.5', '3.B.6'], 'name': 'Land'},
-    'M.3.C.1.AG': {'sources': ['3.C.1.c', '3.C.1.b'], 'name': 'Emissions from Biomass '
-                                                          'Burning (Agriculture)'},
-    '3.C.1': {'sources': ['3.C.1.c', '3.C.1.b'], 'name': 'Emissions from Biomass Burning'},
-    '3.C': {'sources': ['3.C.1', '3.C.3', 'M.3.C.45AG', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-    'M.3.C.AG': {'sources': ['3.C.1.AG', '3.C.3', 'M.3.C.45AG', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-    '3.D': {'sources': ['3.D.1'], 'name': 'Other'},
-    '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
-    'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock emissions'},
-    '0': {'sources': ['1', '2', '3', '4', '5']},
+    "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+    "3.B": {
+        "sources": ["3.B.1", "3.B.2", "3.B.3", "3.B.4", "3.B.5", "3.B.6"],
+        "name": "Land",
+    },
+    "M.3.C.1.AG": {
+        "sources": ["3.C.1.c", "3.C.1.b"],
+        "name": "Emissions from Biomass " "Burning (Agriculture)",
+    },
+    "3.C.1": {
+        "sources": ["3.C.1.c", "3.C.1.b"],
+        "name": "Emissions from Biomass Burning",
+    },
+    "3.C": {
+        "sources": ["3.C.1", "3.C.3", "M.3.C.45AG", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land",
+    },
+    "M.3.C.AG": {
+        "sources": ["3.C.1.AG", "3.C.3", "M.3.C.45AG", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+    },
+    "3.D": {"sources": ["3.D.1"], "name": "Other"},
+    "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
+    "M.AG.ELV": {
+        "sources": ["M.3.C.AG"],
+        "name": "Agriculture excluding livestock emissions",
+    },
+    "0": {"sources": ["1", "2", "3", "4", "5"]},
 }

+ 88 - 56
src/unfccc_ghg_data/unfccc_reader/Montenegro/read_MNE_BUR3_from_pdf.py

@@ -1,41 +1,41 @@
-# Montenegro BUR 3
-# Code to read the emissions inventory contained in Montenegro's third BUR from pdf
-# and convert into PRIMAP2 format
+"""
+Read Montenegro's BUR3 from pdf
+
+This script reads data from Montenegro's BUR3
+Data are read from pdf using camelot
+
+"""
+
 
 # ###
 # imports
 # ###
 import copy
 import re
-from pathlib import Path
 
 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_mne_bur3 import aggregate_cats, cat_mapping, drop_data
+from config_mne_bur3 import aggregate_cats, cat_mapping, drop_data
 from primap2.pm2io._data_reading import matches_time_format
 
+from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
+
 if __name__ == "__main__":
     # ###
     # configuration
     # ###
 
-    # folders and files
-    root_path = Path(__file__).parents[3].absolute()
-    root_path = root_path.resolve()
-    downloaded_data_path = root_path / "downloaded_data"
-    extracted_data_path = root_path / "extracted_data"
-
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Montenegro' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Montenegro'
-    output_filename = 'MNE_BUR3_2022_'
+    input_folder = downloaded_data_path / "UNFCCC" / "Montenegro" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Montenegro"
+    output_filename = "MNE_BUR3_2022_"
     compression = dict(zlib=True, complevel=9)
 
-    inventory_file_pdf = 'NIR-2021_MNE_Finalversion.pdf'
+    inventory_file_pdf = "NIR-2021_MNE_Finalversion.pdf"
 
     # reading and processing
     years_to_read = range(1990, 2018 + 1)
-    pages_to_read = range(535,583)
+    pages_to_read = range(535, 583)
 
     pos_entity = [0, 0]
     cat_code_col = 0
@@ -43,7 +43,7 @@ if __name__ == "__main__":
     regex_unit = r"\((.*)\)"
     regex_entity = r"^(.*)\s\("
 
-    gwp_to_use = 'AR4GWP100'
+    gwp_to_use = "AR4GWP100"
 
     # conversion to PRIMAP2 format
 
@@ -61,28 +61,28 @@ if __name__ == "__main__":
     }
 
     coords_value_mapping = {
-        'unit': 'PRIMAP1',
-        'entity': {
+        "unit": "PRIMAP1",
+        "entity": {
             f"GHG ({gwp_to_use})": f"KYOTOGHG ({gwp_to_use})",
             f"HFC ({gwp_to_use})": f"HFCS ({gwp_to_use})",
             f"PFC ({gwp_to_use})": f"PFCS ({gwp_to_use})",
         },
-        'category': {
-            'Total national GHG emissions (with LULUCF)': '0',
-            'Total national GHG emissions (without LULUCF)': 'M.0.EL',
-            'International Bunkers': 'M.BK',
-            '1.A.3.a.i': 'M.BK.A',
-            '1.A.3.d.i': 'M.BK.M',
-            'CO2 from Biomass Combustion for Energy Production': 'M.BIO',
-            '6 Other': '6',
-            '2 H': '2.H',
+        "category": {
+            "Total national GHG emissions (with LULUCF)": "0",
+            "Total national GHG emissions (without LULUCF)": "M.0.EL",
+            "International Bunkers": "M.BK",
+            "1.A.3.a.i": "M.BK.A",
+            "1.A.3.d.i": "M.BK.M",
+            "CO2 from Biomass Combustion for Energy Production": "M.BIO",
+            "6 Other": "6",
+            "2 H": "2.H",
         },
     }
 
     coords_value_filling = {
         "category": {
             "orig_cat_name": {
-                'International Bunkers': 'M.BK',
+                "International Bunkers": "M.BK",
             },
         },
     }
@@ -103,7 +103,8 @@ if __name__ == "__main__":
         "references": "https://unfccc.int/documents/461972",
         "rights": "",
         "contact": "mail@johannes-guetschow.de",
-        "title": "Montenegro. Biennial update report (BUR). BUR 3. National inventory report.",
+        "title": "Montenegro. Biennial update report (BUR). "
+        "BUR 3. National inventory report.",
         "comment": "Read fom pdf file by Johannes Gütschow",
         "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
     }
@@ -111,7 +112,11 @@ if __name__ == "__main__":
     # ###
     # Read all time series table from pdf
     # ###
-    tables = camelot.read_pdf(str(input_folder / inventory_file_pdf), pages=','.join([str(page) for page in pages_to_read]), flavor='lattice')
+    tables = camelot.read_pdf(
+        str(input_folder / inventory_file_pdf),
+        pages=",".join([str(page) for page in pages_to_read]),
+        flavor="lattice",
+    )
 
     # ###
     # process tables and combine them using the pm2 pr.merge function
@@ -142,11 +147,14 @@ if __name__ == "__main__":
 
         # remove ',' in numbers
         years = df_current_table.columns[2:]
-        def repl(m):
+
+        def repl(m):  # noqa: D103
             return m.group("part1") + m.group("part2")
+
         for year in years:
             df_current_table.loc[:, year] = df_current_table.loc[:, year].str.replace(
-                '(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
+                "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+            )
 
         # add entity and unit cols
         df_current_table["entity"] = entity
@@ -156,13 +164,15 @@ if __name__ == "__main__":
             to_drop = drop_data[i]
             if "cats" in to_drop.keys():
                 mask = df_current_table["category"].isin(to_drop["cats"])
-                df_current_table = df_current_table.drop(df_current_table[mask].index,
-                                                         axis=0)
+                df_current_table = df_current_table.drop(
+                    df_current_table[mask].index, axis=0
+                )
             if "years" in to_drop.keys():
                 df_current_table = df_current_table.drop(columns=to_drop["years"])
 
         df_current_table["category"] = df_current_table["category"].fillna(
-            value=df_current_table["orig_cat_name"])
+            value=df_current_table["orig_cat_name"]
+        )
 
         df_current_table = df_current_table.drop(columns="orig_cat_name")
 
@@ -191,7 +201,7 @@ if __name__ == "__main__":
     # ###
 
     # convert to mass units from CO2eq
-    entities_to_convert = ['N2O', 'SF6', 'CH4']
+    entities_to_convert = ["N2O", "SF6", "CH4"]
     entities_to_convert = [f"{entity} ({gwp_to_use})" for entity in entities_to_convert]
 
     # for entity in entities_to_convert:
@@ -215,21 +225,28 @@ if __name__ == "__main__":
 
     # map categories
     data_if_2006 = data_if_2006.replace(
-        {f"category ({coords_terminologies['category']})": cat_mapping})
+        {f"category ({coords_terminologies['category']})": cat_mapping}
+    )
     data_if_2006[f"category ({coords_terminologies['category']})"].unique()
 
     # rename the category col
-    data_if_2006.rename(columns={
-        f"category ({coords_terminologies['category']})": 'category (IPCC2006_PRIMAP)'},
-                        inplace=True)
-    data_if_2006.attrs['attrs']['cat'] = 'category (IPCC2006_PRIMAP)'
-    data_if_2006.attrs['dimensions']['*'] = [
-        'category (IPCC2006_PRIMAP)' if item == f"category ({coords_terminologies['category']})"
-        else item for item in data_if_2006.attrs['dimensions']['*']]
+    data_if_2006 = data_if_2006.rename(
+        columns={
+            f"category ({coords_terminologies['category']})": "category (IPCC2006_PRIMAP)"
+        }
+    )
+    data_if_2006.attrs["attrs"]["cat"] = "category (IPCC2006_PRIMAP)"
+    data_if_2006.attrs["dimensions"]["*"] = [
+        "category (IPCC2006_PRIMAP)"
+        if item == f"category ({coords_terminologies['category']})"
+        else item
+        for item in data_if_2006.attrs["dimensions"]["*"]
+    ]
     # aggregate categories
     for cat_to_agg in aggregate_cats:
         mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
-            aggregate_cats[cat_to_agg]["sources"])
+            aggregate_cats[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]
         # print(df_test)
 
@@ -237,10 +254,10 @@ if __name__ == "__main__":
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -248,8 +265,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
             # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
@@ -257,7 +281,7 @@ if __name__ == "__main__":
 
             df_combine = df_combine.reset_index()
 
-            data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join='outer')
+            data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join="outer")
             data_if_2006 = data_if_2006.reset_index(drop=True)
         else:
             print(f"no data to aggregate category {cat_to_agg}")
@@ -268,7 +292,6 @@ if __name__ == "__main__":
     # convert back to IF to have units in the fixed format
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
 
-
     # ###
     # save data to IF and native format
     # ###
@@ -276,13 +299,22 @@ if __name__ == "__main__":
         output_folder.mkdir()
 
     # data in original categories
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     encoding = {var: compression for var in data_all.data_vars}
-    data_all.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_all.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )
 
     # data in 2006 categories
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006
+    )
 
     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Morocco/__init__.py

@@ -0,0 +1,30 @@
+"""Read Morocco's BURs, NIRs, NCs
+
+Scripts and configurations to read Morocco's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'MAR'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=MAR
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 187 - 108
src/unfccc_ghg_data/unfccc_reader/Morocco/config_mar_bur3.py

@@ -1,57 +1,98 @@
+"""Config for Morocco's BUR3
+
+Partial configuration for camelot and data aggregation. PRIMAP2 conversion
+config and metadata are defined in the reading script.
+
+"""
+
 # define which raw tables to combine
 table_defs = {
     2010: {
-        'Energy': [0, 1],
-        'Agriculture': [10],
-        'IPPU': [15, 16, 17],
-        'LULUCF': [30],
-        'Waste': [35],
+        "Energy": [0, 1],
+        "Agriculture": [10],
+        "IPPU": [15, 16, 17],
+        "LULUCF": [30],
+        "Waste": [35],
     },
     2012: {
-        'Energy': [2, 3],
-        'Agriculture': [11],
-        'IPPU': [18, 19, 20],
-        'LULUCF': [31],
-        'Waste': [36],
+        "Energy": [2, 3],
+        "Agriculture": [11],
+        "IPPU": [18, 19, 20],
+        "LULUCF": [31],
+        "Waste": [36],
     },
     2014: {
-        'Energy': [4, 5],
-        'Agriculture': [10],
-        'IPPU': [21, 22, 23],
-        'LULUCF': [32],
-        'Waste': [37],
+        "Energy": [4, 5],
+        "Agriculture": [10],
+        "IPPU": [21, 22, 23],
+        "LULUCF": [32],
+        "Waste": [37],
     },
     2016: {
-        'Energy': [6, 7],
-        'Agriculture': [10],
-        'IPPU': [24, 25, 26],
-        'LULUCF': [33],
-        'Waste': [38],
+        "Energy": [6, 7],
+        "Agriculture": [10],
+        "IPPU": [24, 25, 26],
+        "LULUCF": [33],
+        "Waste": [38],
     },
     2018: {
-        'Energy': [8, 9],
-        'Agriculture': [14],
-        'IPPU': [27, 28, 29],
-        'LULUCF': [34],
-        'Waste': [39],
+        "Energy": [8, 9],
+        "Agriculture": [14],
+        "IPPU": [27, 28, 29],
+        "LULUCF": [34],
+        "Waste": [39],
     },
 }
 
 header_defs = {
-    'Energy': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'Agriculture': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'Gg', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']], # units are wrong
+    "Energy": [
+        ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "COVNM", "SO2"],
+        ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
+    ],
+    "Agriculture": [
+        ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "COVNM", "SO2"],
+        ["", "Gg", "GgCO2eq", "GgCO2eq", "Gg", "Gg", "Gg", "Gg"],
+    ],  # units are wrong
     # in BUR pdf
-    'IPPU': [['Catégories', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'LULUCF': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
-    'Waste': [['Catégories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'COVNM', 'SO2'],
-        ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'Gg', 'Gg', 'Gg', 'Gg']],
+    "IPPU": [
+        [
+            "Catégories",
+            "CO2",
+            "CH4",
+            "N2O",
+            "HFCs",
+            "PFCs",
+            "SF6",
+            "NOx",
+            "CO",
+            "COVNM",
+            "SO2",
+        ],
+        [
+            "",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "GgCO2eq",
+            "Gg",
+            "Gg",
+            "Gg",
+            "Gg",
+        ],
+    ],
+    "LULUCF": [
+        ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "COVNM", "SO2"],
+        ["", "GgCO2eq", "GgCO2eq", "GgCO2eq", "Gg", "Gg", "Gg", "Gg"],
+    ],
+    "Waste": [
+        ["Catégories", "CO2", "CH4", "N2O", "NOx", "CO", "COVNM", "SO2"],
+        ["", "GgCO2eq", "GgCO2eq", "GgCO2eq", "Gg", "Gg", "Gg", "Gg"],
+    ],
 }
 
-remove_cats = ['3.A.4', '3.B', '3.B.4', '1.B.2.a', '1.B.2.b', '1.B.2.c']
+remove_cats = ["3.A.4", "3.B", "3.B.4", "1.B.2.a", "1.B.2.b", "1.B.2.c"]
 
 cat_mapping = {
     "1.B.2.a.4": "1.B.2.a.iii.4",
@@ -61,81 +102,119 @@ cat_mapping = {
     "1.B.2.b.4": "1.B.2.b.iii.4",
     "1.B.2.b.5": "1.B.2.b.iii.5",
     "1.B.2.b.6": "1.B.2.b.iii.6",
-    "1.B.2.c.1": "1.B.2.b.i", # simplification, split to oil and gas ("1.B.2.X.i")
-    "1.B.2.c.2": "1.B.2.b.ii", # simplification, split to oil and gas ("1.B.2.X.ii")
-    '1.A.2.g': '1.A.2.m', # other industry
-    '3.A': '3.A.1', # enteric fermentation
-    '3.A.1': '3.A.1.a', # cattle
-    '3.A.1.a': '3.A.1.a.i',
-    '3.A.1.b': '3.A.1.a.ii',
-    '3.A.2': '3.A.1.c',
-    '3.A.3': '3.A.1.h', # Swine
-    '3.A.4.a': '3.A.1.d', # goats
-    '3.A.4.b': '3.A.1.e', # camels
-    '3.A.4.c': '3.A.1.f', # horses
-    '3.A.4.d': '3.A.1.g', # Mules and asses
-    '3.A.4.e': '3.A.1.i', # poultry
-#    '3.B': '3.A.2', # Manure Management
-    '3.B.1': '3.A.2.a', # cattle
-    '3.B.1.a': '3.A.2.a.i',
-    '3.B.1.b': '3.A.2.a.ii',
-    '3.B.2': '3.A.2.c', # Sheep
-    '3.B.3': '3.A.2.h', # Swine
-    '3.B.4.a': '3.A.2.d', # Goats
-    '3.B.4.b': '3.A.2.e', # Camels
-    '3.B.4.c': '3.A.2.f', # Horses
-    '3.B.4.d': '3.A.2.g', # Mules and Asses
-    '3.B.4.e': '3.A.2.i', # Poultry
-    '3.B.5': '3.C.6', # indirect N2O from manure management
-    '3.C': '3.C.7', # rice
-    '3.D': 'M.3.C.45AG', # Agricultural soils
-    '3.D.a': '3.C.4', #direct N2O from agri soils
-    '3.D.a.1': '3.C.4.a', # inorganic fertilizers
-    '3.D.a.2': '3.C.4.b', # organic fertilizers
-    '3.D.a.3': '3.C.4.c', # urine and dung by grazing animals
-    '3.D.a.4': '3.C.4.d', # N in crop residues
-    '3.D.b': '3.C.5', # indirect N2O from managed soils
-    '3.D.b.1': '3.C.5.a', # Atmospheric deposition
-    '3.D.b.2': '3.C.5.b', # nitrogen leeching and runoff
-    '3.H': '3.C.3', # urea application
-    'LU.3.B.1': '3.B.1', # forest
-    'LU.3.B.2': '3.B.2', # cropland
-    'LU.3.B.3': '3.B.3', # grassland
-    'LU.3.B.4': '3.B.4', # wetland
-    'LU.3.B.5': '3.B.5', # Settlements
-    'LU.3.B.6': '3.B.6', # other land
+    "1.B.2.c.1": "1.B.2.b.i",  # simplification, split to oil and gas ("1.B.2.X.i")
+    "1.B.2.c.2": "1.B.2.b.ii",  # simplification, split to oil and gas ("1.B.2.X.ii")
+    "1.A.2.g": "1.A.2.m",  # other industry
+    "3.A": "3.A.1",  # enteric fermentation
+    "3.A.1": "3.A.1.a",  # cattle
+    "3.A.1.a": "3.A.1.a.i",
+    "3.A.1.b": "3.A.1.a.ii",
+    "3.A.2": "3.A.1.c",
+    "3.A.3": "3.A.1.h",  # Swine
+    "3.A.4.a": "3.A.1.d",  # goats
+    "3.A.4.b": "3.A.1.e",  # camels
+    "3.A.4.c": "3.A.1.f",  # horses
+    "3.A.4.d": "3.A.1.g",  # Mules and asses
+    "3.A.4.e": "3.A.1.i",  # poultry
+    #    '3.B': '3.A.2', # Manure Management
+    "3.B.1": "3.A.2.a",  # cattle
+    "3.B.1.a": "3.A.2.a.i",
+    "3.B.1.b": "3.A.2.a.ii",
+    "3.B.2": "3.A.2.c",  # Sheep
+    "3.B.3": "3.A.2.h",  # Swine
+    "3.B.4.a": "3.A.2.d",  # Goats
+    "3.B.4.b": "3.A.2.e",  # Camels
+    "3.B.4.c": "3.A.2.f",  # Horses
+    "3.B.4.d": "3.A.2.g",  # Mules and Asses
+    "3.B.4.e": "3.A.2.i",  # Poultry
+    "3.B.5": "3.C.6",  # indirect N2O from manure management
+    "3.C": "3.C.7",  # rice
+    "3.D": "M.3.C.45AG",  # Agricultural soils
+    "3.D.a": "3.C.4",  # direct N2O from agri soils
+    "3.D.a.1": "3.C.4.a",  # inorganic fertilizers
+    "3.D.a.2": "3.C.4.b",  # organic fertilizers
+    "3.D.a.3": "3.C.4.c",  # urine and dung by grazing animals
+    "3.D.a.4": "3.C.4.d",  # N in crop residues
+    "3.D.b": "3.C.5",  # indirect N2O from managed soils
+    "3.D.b.1": "3.C.5.a",  # Atmospheric deposition
+    "3.D.b.2": "3.C.5.b",  # nitrogen leeching and runoff
+    "3.H": "3.C.3",  # urea application
+    "LU.3.B.1": "3.B.1",  # forest
+    "LU.3.B.2": "3.B.2",  # cropland
+    "LU.3.B.3": "3.B.3",  # grassland
+    "LU.3.B.4": "3.B.4",  # wetland
+    "LU.3.B.5": "3.B.5",  # Settlements
+    "LU.3.B.6": "3.B.6",  # other land
 }
 
 aggregate_cats = {
-    '1.B.2.a.iii': {'sources': ['1.B.2.a.iii.4', '1.B.2.a.iii.5', '1.B.2.a.iii.6'],
-                    'name': 'All Other'},
-    '1.B.2.b.iii': {'sources': ['1.B.2.b.iii.2', '1.B.2.b.iii.4', '1.B.2.b.iii.5',
-                                '1.B.2.b.iii.6',],
-                    'name': 'All Other'},
-    '1.B.2.a': {'sources': ['1.B.2.a.iii'], 'name': 'Oil'},
-    '1.B.2.b': {'sources': ['1.B.2.b.i', '1.B.2.b.ii', '1.B.2.b.iii'],
-                'name': 'Natural Gas'},
-    '2.D':  {'sources': ['2.D.4'], 'name': 'Non-Energy Products from Fuels and Solvent Use'},
-    '2.F.1':  {'sources': ['2.F.1.a', '2.F.1.b'], 'name': 'Refrigeration and Air Conditioning'},
-    '2.F':  {'sources': ["2.F.1", "2.F.2", "2.F.3", "2.F.4", "2.F.5", "2.F.6"],
-             'name': 'Product uses as Substitutes for Ozone Depleting Substances'},
-    '2.H':  {'sources': ["2.H.1", "2.H.2", "2.H.3"], 'name': 'Other'},
-    '3.A.2': {'sources': ['3.A.2.a', '3.A.2.c', '3.A.2.d', '3.A.2.e', '3.A.2.f',
-                          '3.A.2.g', '3.A.2.h', '3.A.2.i'],
-              'name': 'Manure Management'},
-    '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-    '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.3', '3.B.4', '3.B.5', '3.B.6'], 'name': 'Land'},
-    '3.C': {'sources': ['3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-    'M.3.C.AG': {'sources': ['3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-    'M.AG': {'sources': ['3.A', 'M.3.C.AG'], 'name': 'Agriculture'},
-    '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
-    'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock emissions'},
-    '4': {'sources': ['4.A', '4.D'], 'name': 'Waste'},
-    '0': {'sources': ['1', '2', '3', '4']},
-    'M.0.EL': {'sources': ['1', '2', 'M.AG', '4']},
+    "1.B.2.a.iii": {
+        "sources": ["1.B.2.a.iii.4", "1.B.2.a.iii.5", "1.B.2.a.iii.6"],
+        "name": "All Other",
+    },
+    "1.B.2.b.iii": {
+        "sources": [
+            "1.B.2.b.iii.2",
+            "1.B.2.b.iii.4",
+            "1.B.2.b.iii.5",
+            "1.B.2.b.iii.6",
+        ],
+        "name": "All Other",
+    },
+    "1.B.2.a": {"sources": ["1.B.2.a.iii"], "name": "Oil"},
+    "1.B.2.b": {
+        "sources": ["1.B.2.b.i", "1.B.2.b.ii", "1.B.2.b.iii"],
+        "name": "Natural Gas",
+    },
+    "2.D": {
+        "sources": ["2.D.4"],
+        "name": "Non-Energy Products from Fuels and Solvent Use",
+    },
+    "2.F.1": {
+        "sources": ["2.F.1.a", "2.F.1.b"],
+        "name": "Refrigeration and Air Conditioning",
+    },
+    "2.F": {
+        "sources": ["2.F.1", "2.F.2", "2.F.3", "2.F.4", "2.F.5", "2.F.6"],
+        "name": "Product uses as Substitutes for Ozone Depleting Substances",
+    },
+    "2.H": {"sources": ["2.H.1", "2.H.2", "2.H.3"], "name": "Other"},
+    "3.A.2": {
+        "sources": [
+            "3.A.2.a",
+            "3.A.2.c",
+            "3.A.2.d",
+            "3.A.2.e",
+            "3.A.2.f",
+            "3.A.2.g",
+            "3.A.2.h",
+            "3.A.2.i",
+        ],
+        "name": "Manure Management",
+    },
+    "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+    "3.B": {
+        "sources": ["3.B.1", "3.B.2", "3.B.3", "3.B.4", "3.B.5", "3.B.6"],
+        "name": "Land",
+    },
+    "3.C": {
+        "sources": ["3.C.3", "3.C.4", "3.C.5", "3.C.6", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land",
+    },
+    "M.3.C.AG": {
+        "sources": ["3.C.3", "3.C.4", "3.C.5", "3.C.6", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+    },
+    "M.AG": {"sources": ["3.A", "M.3.C.AG"], "name": "Agriculture"},
+    "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
+    "M.AG.ELV": {
+        "sources": ["M.3.C.AG"],
+        "name": "Agriculture excluding livestock emissions",
+    },
+    "4": {"sources": ["4.A", "4.D"], "name": "Waste"},
+    "0": {"sources": ["1", "2", "3", "4"]},
+    "M.0.EL": {"sources": ["1", "2", "M.AG", "4"]},
 }
 
-zero_cats = ['1.B.2.a.i', '1.B.2.a.ii'] # venting and flaring with 0 for oil as
+zero_cats = ["1.B.2.a.i", "1.B.2.a.ii"]  # venting and flaring with 0 for oil as
 # all mapped to natural gas

+ 122 - 88
src/unfccc_ghg_data/unfccc_reader/Morocco/read_MAR_BUR3_from_pdf.py

@@ -1,13 +1,23 @@
-# this script reads data from Morocco's BUR3
-# Data is read from pdf
+"""
+Read Morocco's BUR3 from pdf
 
+This script reads data from Morocco's BUR3
+Data are read from pdf using camelot
+
+"""
 import copy
 
 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_mar_bur3 import (aggregate_cats, cat_mapping, header_defs, remove_cats,
-                              table_defs, zero_cats)
+from config_mar_bur3 import (
+    aggregate_cats,
+    cat_mapping,
+    header_defs,
+    remove_cats,
+    table_defs,
+    zero_cats,
+)
 from primap2.pm2io._data_reading import filter_data, matches_time_format
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -16,11 +26,11 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Morocco' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Morocco'
-    output_filename = 'MAR_BUR3_2022_'
-    inventory_file = 'Morocco_BUR3_Fr.pdf'
-    gwp_to_use = 'AR4GWP100'
+    input_folder = downloaded_data_path / "UNFCCC" / "Morocco" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Morocco"
+    output_filename = "MAR_BUR3_2022_"
+    inventory_file = "Morocco_BUR3_Fr.pdf"
+    gwp_to_use = "AR4GWP100"
 
     # years to read
     years = [2010, 2012, 2014, 2016, 2018]
@@ -31,30 +41,28 @@ if __name__ == "__main__":
     # special header as category code and name in one column
     header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
 
-    index_cols = ['Catégories']
+    index_cols = ["Catégories"]
 
     # rows to remove
-    cats_remove = [
-        'Agriculture' # always empty
-    ]
+    cats_remove = ["Agriculture"]  # always empty
 
     # manual category codes
     cat_codes_manual = {
-        '1.A.2.e -Industries agro-alimentaires et du tabac': '1.A.2.e',
-        '1.A.2.f -Industries des minéraux non- métalliques': '1.A.2.f',
+        "1.A.2.e -Industries agro-alimentaires et du tabac": "1.A.2.e",
+        "1.A.2.f -Industries des minéraux non- métalliques": "1.A.2.f",
         #'Agriculture': 'M.AG',
-        '2. PIUP': '2',
-        'UTCATF': 'M.LULUCF',
-        '3.B.1 Terres forestières': 'LU.3.B.1',
-        '3.B.2 Terres cultivées': 'LU.3.B.2',
-        '3.B.3 Prairies': 'LU.3.B.3',
-        '3.B.4 Terres humides': 'LU.3.B.4',
-        '3.B.5 Etablissements': 'LU.3.B.5',
-        '3.B.6 Autres terres': 'LU.3.B.6',
-        '1.B.1.a.i.1 -Exploitation minière': '1.A.1.a.i.1',
+        "2. PIUP": "2",
+        "UTCATF": "M.LULUCF",
+        "3.B.1 Terres forestières": "LU.3.B.1",
+        "3.B.2 Terres cultivées": "LU.3.B.2",
+        "3.B.3 Prairies": "LU.3.B.3",
+        "3.B.4 Terres humides": "LU.3.B.4",
+        "3.B.5 Etablissements": "LU.3.B.5",
+        "3.B.6 Autres terres": "LU.3.B.6",
+        "1.B.1.a.i.1 -Exploitation minière": "1.A.1.a.i.1",
     }
 
-    cat_code_regexp = r'(?P<code>^[a-zA-Z0-9\.]{1,14})\s-\s.*'
+    cat_code_regexp = r"(?P<code>^[a-zA-Z0-9\.]{1,14})\s-\s.*"
 
     coords_terminologies = {
         "area": "ISO3",
@@ -66,32 +74,29 @@ if __name__ == "__main__":
         "source": "MAR-GHG-inventory ",
         "provenance": "measured",
         "area": "MAR",
-        "scenario": "BUR3"
+        "scenario": "BUR3",
     }
 
     coords_value_mapping = {
         "unit": "PRIMAP1",
         "entity": {
-            'HFCs (AR4GWP100)': 'HFCS (AR4GWP100)',
-            'PFCs (AR4GWP100)': 'PFCS (AR4GWP100)',
-            'COVNM': 'NMVOC',
-        }
+            "HFCs (AR4GWP100)": "HFCS (AR4GWP100)",
+            "PFCs (AR4GWP100)": "PFCS (AR4GWP100)",
+            "COVNM": "NMVOC",
+        },
     }
 
+    coords_cols = {"category": "category", "entity": "entity", "unit": "unit"}
 
-    coords_cols = {
-        "category": "category",
-        "entity": "entity",
-        "unit": "unit"
-    }
-
-    #add_coords_cols = {
+    # add_coords_cols = {
     #    "orig_cat_name": ["orig_cat_name", "category"],
-    #}
+    # }
 
     filter_remove = {
         "f1": {
-            "entity": ['Other halogenated gases without CO2 equivalent conversion factors (2)'],
+            "entity": [
+                "Other halogenated gases without CO2 equivalent conversion factors (2)"
+            ],
         },
     }
 
@@ -107,8 +112,9 @@ if __name__ == "__main__":
     ##### read the raw data from pdf #####
     tables = camelot.read_pdf(
         str(input_folder / inventory_file),
-        pages=','.join([str(page) for page in pages_to_read]),
-        flavor='lattice')
+        pages=",".join([str(page) for page in pages_to_read]),
+        flavor="lattice",
+    )
 
     ##### combine tables and convert to long format #####
     df_all = None
@@ -120,8 +126,9 @@ if __name__ == "__main__":
             df_first = tables[sector_tables[0]].df
             if len(sector_tables) > 1:
                 for table in sector_tables[1:]:
-                    df_this_table = pd.concat([df_first, tables[table].df], axis=0,
-                                              join='outer')
+                    df_this_table = pd.concat(
+                        [df_first, tables[table].df], axis=0, join="outer"
+                    )
             else:
                 df_this_table = df_first
 
@@ -130,11 +137,11 @@ if __name__ == "__main__":
             df_this_table.columns = header_defs[sector]
 
             # fix 2018 agri table
-            if (year == 2018) & (sector == "Agriculture"):
+            if (year == 2018) & (sector == "Agriculture"):  # noqa: PLR2004
                 last_shift_row = 25
-                df_temp = df_this_table.iloc[0: last_shift_row, 1:].copy()
-                df_this_table.iloc[0, 1:] = ''
-                df_this_table.iloc[1: last_shift_row + 1, 1:] = df_temp
+                df_temp = df_this_table.iloc[0:last_shift_row, 1:].copy()
+                df_this_table.iloc[0, 1:] = ""
+                df_this_table.iloc[1 : last_shift_row + 1, 1:] = df_temp
 
             # replace line breaks, long hyphens, double, and triple spaces in category names
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
@@ -144,14 +151,15 @@ if __name__ == "__main__":
 
             # set index and convert to long format
             df_this_table = df_this_table.set_index(index_cols)
-            df_this_table_long = pm2.pm2io.nir_convert_df_to_long(df_this_table, year,
-                                                                  header_long)
+            df_this_table_long = pm2.pm2io.nir_convert_df_to_long(
+                df_this_table, year, header_long
+            )
 
             # print(df_this_table_long.head())
             if df_all is None:
                 df_all = df_this_table_long
             else:
-                df_all = pd.concat([df_all, df_this_table_long], axis=0, join='outer')
+                df_all = pd.concat([df_all, df_this_table_long], axis=0, join="outer")
 
     df_all = df_all.reset_index(drop=True)
 
@@ -166,24 +174,32 @@ if __name__ == "__main__":
     # replace cat names by codes in col "category"
     # first the manual replacements
     df_all["category"] = df_all["category"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-        return m.group('code')
-    df_all["category"] = df_all["category"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_all["category"] = df_all["category"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
     df_all = df_all.reset_index(drop=True)
 
     # prepare numbers for pd.to_numeric
-    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(' ', '')
-    def repl(m):
-        return m.group('part1') + '.' + m.group('part2')
-    df_all.loc[:, 'data'] = df_all.loc[:, 'data'].str.replace(
-        '(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-    df_all['data'][df_all['data'].isnull()] = 'NaN'
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(" ", "")
+
+    def repl(m):  # noqa: D103
+        return m.group("part1") + "." + m.group("part2")
+
+    df_all.loc[:, "data"] = df_all.loc[:, "data"].str.replace(
+        "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+    )
+    df_all["data"][df_all["data"].isna()] = "NaN"
 
     # add GWP information to entity
     for entity in df_all["entity"].unique():
-        df_all["entity"][(df_all["entity"] == entity) & (
-                    df_all["unit"] == "GgCO2eq")] = f"{entity} ({gwp_to_use})"
+        df_all["entity"][
+            (df_all["entity"] == entity) & (df_all["unit"] == "GgCO2eq")
+        ] = f"{entity} ({gwp_to_use})"
 
     # drop "original_cat_name" as it has non-unique values per category
     df_all = df_all.drop(columns="orig_cat_name")
@@ -196,7 +212,8 @@ if __name__ == "__main__":
         coords_value_mapping=coords_value_mapping,
         filter_remove=filter_remove,
         meta_data=meta_data,
-        convert_str=True
+        convert_str=True,
+        time_format="%Y",
     )
 
     # make sure all col headers are str
@@ -205,7 +222,9 @@ if __name__ == "__main__":
     # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
 
-    entities_to_convert = ['CO2'] #['N2O', 'SF6', 'CO2', 'CH4'] # CO2 is not converted on
+    entities_to_convert = [
+        "CO2"
+    ]  # ['N2O', 'SF6', 'CO2', 'CH4'] # CO2 is not converted on
     # conversion to IF as data with and without GWP exists. needs to be fixed in primap2
     entities_to_convert = [f"{entity} (AR4GWP100)" for entity in entities_to_convert]
 
@@ -230,38 +249,42 @@ if __name__ == "__main__":
     data_if_2006.attrs = copy.deepcopy(data_if.attrs)
 
     filter_remove_cats = {
-        "cat": {
-            f"category ({coords_terminologies['category']})":
-        remove_cats
-        },
+        "cat": {f"category ({coords_terminologies['category']})": remove_cats},
     }
 
     filter_data(data_if_2006, filter_remove=filter_remove_cats)
 
     # map categories
     data_if_2006 = data_if_2006.replace(
-        {f"category ({coords_terminologies['category']})": cat_mapping})
+        {f"category ({coords_terminologies['category']})": cat_mapping}
+    )
     data_if_2006[f"category ({coords_terminologies['category']})"].unique()
 
     # rename the category col
-    data_if_2006.rename(columns={
-        f"category ({coords_terminologies['category']})": 'category (IPCC2006_PRIMAP)'},
-                        inplace=True)
-    data_if_2006.attrs['attrs']['cat'] = 'category (IPCC2006_PRIMAP)'
-    data_if_2006.attrs['dimensions']['*'] = [
-        'category (IPCC2006_PRIMAP)' if item == f"category ({coords_terminologies['category']})"
-        else item for item in data_if_2006.attrs['dimensions']['*']]
+    data_if_2006 = data_if_2006.rename(
+        columns={
+            f"category ({coords_terminologies['category']})": "category (IPCC2006_PRIMAP)"
+        }
+    )
+    data_if_2006.attrs["attrs"]["cat"] = "category (IPCC2006_PRIMAP)"
+    data_if_2006.attrs["dimensions"]["*"] = [
+        "category (IPCC2006_PRIMAP)"
+        if item == f"category ({coords_terminologies['category']})"
+        else item
+        for item in data_if_2006.attrs["dimensions"]["*"]
+    ]
     # aggregate categories
-    time_format = '%Y'
+    time_format = "%Y"
     time_columns = [
         col
-        for col in data_if_2006.columns.values
+        for col in data_if_2006.columns.to_numpy()
         if matches_time_format(col, time_format)
     ]
 
     for cat_to_agg in aggregate_cats:
         mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
-            aggregate_cats[cat_to_agg]["sources"])
+            aggregate_cats[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]
         # print(df_test)
 
@@ -273,8 +296,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
             # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
@@ -282,15 +312,16 @@ if __name__ == "__main__":
 
             df_combine = df_combine.reset_index()
 
-            data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join='outer')
+            data_if_2006 = pd.concat([data_if_2006, df_combine], axis=0, join="outer")
             data_if_2006 = data_if_2006.reset_index(drop=True)
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
     for cat in zero_cats:
         entities = data_if_2006["entity"].unique()
-        data_zero = data_if_2006[data_if_2006["category (IPCC2006_PRIMAP)"]=="1"].copy(
-            deep=True)
+        data_zero = data_if_2006[
+            data_if_2006["category (IPCC2006_PRIMAP)"] == "1"
+        ].copy(deep=True)
         data_zero["category (IPCC2006_PRIMAP)"] = cat
         for col in time_columns:
             data_zero[col] = 0
@@ -303,7 +334,6 @@ if __name__ == "__main__":
     # convert back to IF to have units in the fixed format
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
 
-
     # ###
     # save data to IF and native format
     # ###
@@ -312,17 +342,21 @@ if __name__ == "__main__":
 
     # data in original categories
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"]), data_if)
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
         output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
-        encoding=encoding)
+        encoding=encoding,
+    )
 
     # data in 2006 categories
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006)
+        output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006
+    )
 
     encoding = {var: compression for var in data_pm2_2006.data_vars}
     data_pm2_2006.pr.to_netcdf(
-        output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding)
+        output_folder / (output_filename + "IPCC2006_PRIMAP" + ".nc"), encoding=encoding
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Nigeria/__init__.py

@@ -0,0 +1,30 @@
+"""Read Nigeria's BURs, NIRs, NCs
+
+Scripts and configurations to read Nigeria's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'NGA'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=NGA
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 294 - 272
src/unfccc_ghg_data/unfccc_reader/Nigeria/config_nga_bur2.py

@@ -1,274 +1,280 @@
-gwp_to_use = 'AR5GWP100'
+"""Config for Nigeria's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
+gwp_to_use = "AR5GWP100"
 
 tables_trends = {
-    '70': { # GHG by main sector
-        'page': '70',
-        'area': ['177,430,450,142'],
-        'cols': ['208,260,311,355,406'],
-        'coords_defaults': {
-            'unit': 'GgCO2eq',
-        },
-        'coords_cols': {
+    "70": {  # GHG by main sector
+        "page": "70",
+        "area": ["177,430,450,142"],
+        "cols": ["208,260,311,355,406"],
+        "coords_defaults": {
+            "unit": "GgCO2eq",
+        },
+        "coords_cols": {
             "category": "Year",
             "entity": "entity",
         },
-        'copy_cols': {
+        "copy_cols": {
             # to: from
-            'entity': 'Year',
+            "entity": "Year",
         },
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU': '3',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU": "3",
+                "Waste": "4",
             },
-            'entity': {
-                'Total emissions': f'KYOTOGHG emissions ({gwp_to_use})',
-                'Energy': f'KYOTOGHG ({gwp_to_use})',
-                'IPPU': f'KYOTOGHG ({gwp_to_use})',
-                'AFOLU': f'KYOTOGHG emissions ({gwp_to_use})',
-                'Waste': f'KYOTOGHG ({gwp_to_use})',
+            "entity": {
+                "Total emissions": f"KYOTOGHG emissions ({gwp_to_use})",
+                "Energy": f"KYOTOGHG ({gwp_to_use})",
+                "IPPU": f"KYOTOGHG ({gwp_to_use})",
+                "AFOLU": f"KYOTOGHG emissions ({gwp_to_use})",
+                "Waste": f"KYOTOGHG ({gwp_to_use})",
             },
         },
-        'label_rows': [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '71': { # main gases by sector
-    'page': '71',
-        'area': ['82,760,509,454'],
-        'cols': ['124,186,249,326,388,454'],
-        'coords_defaults': {
-            'category': '0',
-            'unit': 'GgCO2eq',
-        },
-        'coords_cols': {
+    "71": {  # main gases by sector
+        "page": "71",
+        "area": ["82,760,509,454"],
+        "cols": ["124,186,249,326,388,454"],
+        "coords_defaults": {
+            "category": "0",
+            "unit": "GgCO2eq",
+        },
+        "coords_cols": {
             "entity": "Year",
         },
-        'remove_cols': [],
-        'coords_value_mapping': {
+        "remove_cols": [],
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'entity': {
-                'Total GHG emissions (CO₂-eq)': f'KYOTOGHG emissions ({gwp_to_use})',
-                'Removals (CO₂) (CO₂-eq)': 'CO2 removals',
-                'Net emissions (CO₂-eq)': f'KYOTOGHG ({gwp_to_use})',
-                'CO₂ (Gg)': 'CO2 emissions',
-                'CH₄ (CO₂-eq)': f'CH4 ({gwp_to_use})',
-                'N₂O (CO₂-eq)': f'N2O ({gwp_to_use})',
+            "entity": {
+                "Total GHG emissions (CO₂-eq)": f"KYOTOGHG emissions ({gwp_to_use})",
+                "Removals (CO₂) (CO₂-eq)": "CO2 removals",
+                "Net emissions (CO₂-eq)": f"KYOTOGHG ({gwp_to_use})",
+                "CO₂ (Gg)": "CO2 emissions",
+                "CH₄ (CO₂-eq)": f"CH4 ({gwp_to_use})",
+                "N₂O (CO₂-eq)": f"N2O ({gwp_to_use})",
             },
         },
-        'label_rows':  [0, 1, 2, 3, 4],
+        "label_rows": [0, 1, 2, 3, 4],
     },
-    '72_1': { # CO2 by main sector
-    'page': '72',
-        'area': ['122,760,496,472'],
-        'cols': ['159,212,265,311,355,406,456'],
-        'coords_defaults': {
+    "72_1": {  # CO2 by main sector
+        "page": "72",
+        "area": ["122,760,496,472"],
+        "cols": ["159,212,265,311,355,406,456"],
+        "coords_defaults": {
             #'entity': 'CO2',
-            'unit': 'Gg',
+            "unit": "Gg",
         },
-        'coords_cols': {
+        "coords_cols": {
             "category": "Year",
-            'entity': 'entity',
+            "entity": "entity",
         },
-        'remove_cols': ['Total emissions'],
-        'copy_cols': {
+        "remove_cols": ["Total emissions"],
+        "copy_cols": {
             # to: from
-            'entity': 'Year',
+            "entity": "Year",
         },
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'category': {
-                'Total net emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU - emissions': '3',
-                'AFOLU - removals': '3',
-                'Waste': '4',
+            "category": {
+                "Total net emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU - emissions": "3",
+                "AFOLU - removals": "3",
+                "Waste": "4",
             },
-            'entity': {
-                'Total net emissions': 'CO2',
-                'Energy': 'CO2',
-                'IPPU': 'CO2',
-                'AFOLU - emissions': 'CO2 emissions',
-                'AFOLU - removals': 'CO2 removals',
-                'Waste': 'CO2',
+            "entity": {
+                "Total net emissions": "CO2",
+                "Energy": "CO2",
+                "IPPU": "CO2",
+                "AFOLU - emissions": "CO2 emissions",
+                "AFOLU - removals": "CO2 removals",
+                "Waste": "CO2",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '72_2': { # CH4 by sector
-    'page': '72',
-        'area': ['133,333,483,41'],
-        'cols': ['172,230,280,333,384,439'],
-        'coords_defaults': {
-            'entity': 'CH4',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "72_2": {  # CH4 by sector
+        "page": "72",
+        "area": ["133,333,483,41"],
+        "cols": ["172,230,280,333,384,439"],
+        "coords_defaults": {
+            "entity": "CH4",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
         },
-        'remove_cols': ['Total (Gg CO₂-eq)'],
-        'coords_value_mapping': {
+        "remove_cols": ["Total (Gg CO₂-eq)"],
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'category': {
-                'Total': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU - emissions': '3',
-                'Waste': '4',
+            "category": {
+                "Total": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU - emissions": "3",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '73': { # N2O by sector
-    'page': '73',
-        'area': ['155,666,643,364'],
-        'cols': ['194,265,309,366,419'],
-        'coords_defaults': {
-            'entity': 'N2O',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "73": {  # N2O by sector
+        "page": "73",
+        "area": ["155,666,643,364"],
+        "cols": ["194,265,309,366,419"],
+        "coords_defaults": {
+            "entity": "N2O",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
         },
-        'remove_cols': ['Total emissions (Gg CO₂-eq)'],
-        'coords_value_mapping': {
+        "remove_cols": ["Total emissions (Gg CO₂-eq)"],
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'category': {
-                'Total': '0',
-                'Energy': '1',
-                'AFOLU': '3',
-                'Waste': '4',
+            "category": {
+                "Total": "0",
+                "Energy": "1",
+                "AFOLU": "3",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '74': { # NOx by sector
-    'page': '74',
-        'area': ['148,457,467,166'],
-        'cols': ['190,254,304,359,421'],
-        'coords_defaults': {
-            'entity': 'NOX',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "74": {  # NOx by sector
+        "page": "74",
+        "area": ["148,457,467,166"],
+        "cols": ["190,254,304,359,421"],
+        "coords_defaults": {
+            "entity": "NOX",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
         },
         #'remove_cols': [],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU': '3',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU": "3",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '75': { # CO by sector
-    'page': '75',
-        'area': ['161,763,456,472'],
-        'cols': ['199,256,307,359,410'],
-        'coords_defaults': {
-            'entity': 'CO',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "75": {  # CO by sector
+        "page": "75",
+        "area": ["161,763,456,472"],
+        "cols": ["199,256,307,359,410"],
+        "coords_defaults": {
+            "entity": "CO",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
         },
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'AFOLU': '3',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "AFOLU": "3",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '75_2': { # NMVOC by sector
-    'page': '75',
-        'area': ['177,325,441,50'],
-        'cols': ['219,287,340,395'],
-        'coords_defaults': {
-            'entity': 'NMVOC',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "75_2": {  # NMVOC by sector
+        "page": "75",
+        "area": ["177,325,441,50"],
+        "cols": ["219,287,340,395"],
+        "coords_defaults": {
+            "entity": "NMVOC",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
         },
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '76_1': { # NMVOC by sector
-    'page': '76',
-        'area': ['175,782,448,675'],
-        'cols': ['216,282,340,390'],
-        'coords_defaults': {
-            'entity': 'NMVOC',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "76_1": {  # NMVOC by sector
+        "page": "76",
+        "area": ["175,782,448,675"],
+        "cols": ["216,282,340,390"],
+        "coords_defaults": {
+            "entity": "NMVOC",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
         },
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'IPPU': '2',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "IPPU": "2",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0, 1, 2],
+        "label_rows": [0, 1, 2],
     },
-    '76_2': { # SO2 by sector
-    'page': '76',
-        'area': ['197,562,421,226'],
-        'cols': ['243,331,381'],
-        'coords_defaults': {
-            'entity': 'SO2',
-            'unit': 'Gg',
-        },
-        'coords_cols': {
+    "76_2": {  # SO2 by sector
+        "page": "76",
+        "area": ["197,562,421,226"],
+        "cols": ["243,331,381"],
+        "coords_defaults": {
+            "entity": "SO2",
+            "unit": "Gg",
+        },
+        "coords_cols": {
             "category": "Year",
         },
         #'remove_cols': ['Total emissions (Gg CO2-eq)'],
-        'coords_value_mapping': {
+        "coords_value_mapping": {
             "unit": "PRIMAP1",
-            'category': {
-                'Total emissions': '0',
-                'Energy': '1',
-                'Waste': '4',
+            "category": {
+                "Total emissions": "0",
+                "Energy": "1",
+                "Waste": "4",
             },
         },
-        'label_rows':  [0],
+        "label_rows": [0],
     },
 }
 
 pages_inventory = {
-    '78': 1,
-    '79': 0,
-    '80': 0,
-    '81': 0,
-    '82': 0,
+    "78": 1,
+    "79": 0,
+    "80": 0,
+    "81": 0,
+    "82": 0,
 }
 
 year_inventory = 2017
@@ -279,8 +285,8 @@ unit_row = 0
 ###
 index_cols = "Categories"
 units_inv = {
-    'Emissions (Gg)': 'Gg',
-    'Emissions CO2 Equivalents (Gg)': 'GgCO2eq',
+    "Emissions (Gg)": "Gg",
+    "Emissions CO2 Equivalents (Gg)": "GgCO2eq",
 }
 # special header as category UNFCCC_GHG_data and name in one column
 header_long = ["category", "entity", "unit", "time", "data"]
@@ -288,11 +294,11 @@ header_long = ["category", "entity", "unit", "time", "data"]
 
 # manual category codes
 cat_codes_manual = {
-    'Total National Emissions and Removals': '0',
-    'International Bunkers': 'M.BK',
+    "Total National Emissions and Removals": "0",
+    "International Bunkers": "M.BK",
 }
 
-cat_code_regexp = r'(?P<code>^[a-zA-Z0-9\.]{1,9})\s.*'
+cat_code_regexp = r"(?P<code>^[a-zA-Z0-9\.]{1,9})\s.*"
 
 coords_cols = {
     "category": "category",
@@ -321,29 +327,24 @@ coords_value_mapping = {
     "unit": "PRIMAP1",
     "category": "PRIMAP1",
     "entity": {
-        'Net CO2 (1)(2)': 'CO2',
-        'CH4': "CH4",
-        'N2O': "N2O",
-        'HFCs': f"HFCS ({gwp_to_use})",
-        'PFCs': f"PFCS ({gwp_to_use})",
-        'SF6': f"SF6 ({gwp_to_use})",
+        "Net CO2 (1)(2)": "CO2",
+        "CH4": "CH4",
+        "N2O": "N2O",
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "SF6": f"SF6 ({gwp_to_use})",
         #'NOx': 'NOX',
-        'CO': 'CO', # no mapping, just added for completeness here
-        'NMVOCs': 'NMVOC',
-        'SO2': 'SO2', # no mapping, just added for completeness here
-        'Other halogenated gases with CO2 eq conversion factors (3)':
-            f"UnspMixOfHFCs ({gwp_to_use})",
+        "CO": "CO",  # no mapping, just added for completeness here
+        "NMVOCs": "NMVOC",
+        "SO2": "SO2",  # no mapping, just added for completeness here
+        "Other halogenated gases with CO2 eq conversion factors (3)": f"UnspMixOfHFCs ({gwp_to_use})",
     },
 }
 
 
 filter_remove = {
-    'f1': {
-        'entity': ['Other halogenated gases without CO2 eq conversion factors (4)']
-    },
-    'f2': {
-        'category': 'Memo'
-    },
+    "f1": {"entity": ["Other halogenated gases without CO2 eq conversion factors (4)"]},
+    "f2": {"category": "Memo"},
 }
 
 filter_keep = {}
@@ -353,73 +354,90 @@ meta_data = {
     "rights": "",
     "contact": "mail@johannes-guestchow.de",
     "title": "Nigeria. Second Biennial Update Report (BUR2) to the United Nations "
-             "Framework Convention on Climate Change",
+    "Framework Convention on Climate Change",
     "comment": "Read fom pdf by Johannes Gütschow",
     "institution": "UNFCCC",
 }
 
 # convert to mass units where possible
-entities_to_convert_to_mass = [
-    'CH4', 'N2O', 'SF6'
-]
+entities_to_convert_to_mass = ["CH4", "N2O", "SF6"]
 
-# CO2 equivalents don't make sense for these substances, so unit has to be Gg instead of Gg CO2 equivalents as indicated in the table
-entities_to_fix_unit = [
-    'NOx', 'CO', 'NMVOCs', 'SO2'
-]
+# CO2 equivalents don't make sense for these substances, so unit has to be Gg instead
+# of Gg CO2 equivalents as indicated in the table
+entities_to_fix_unit = ["NOx", "CO", "NMVOCs", "SO2"]
 
 ### processing
 
 processing_info_step1 = {
-    'aggregate_cats': {
-        '2.F': {'sources': ['2.F.2', '2.F.6'], # all 0, but for completeness
-              'name': 'Product uses as Substitutes for Ozone Depleting Substances'},
-        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G'],
-              'name': 'IPPU'}, # for HFCs, PFCs, SO2, SF6, N2O (all 0)
+    "aggregate_cats": {
+        "2.F": {
+            "sources": ["2.F.2", "2.F.6"],  # all 0, but for completeness
+            "name": "Product uses as Substitutes for Ozone Depleting Substances",
+        },
+        "2": {
+            "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G"],
+            "name": "IPPU",
+        },  # for HFCs, PFCs, SO2, SF6, N2O (all 0)
     },
 }
 
-processing_info_step2 =  {
-    'aggregate_cats': {
-        'M.AG.ELV': {'sources': ['3.C'], 'name': 'Agriculture excluding livestock emissions'},
-        'M.AG': {'sources': ['M.AG.ELV', '3.A'], 'name': 'Agriculture'},
-        'M.LULUCF': {'sources': ['3.B', '3.D'],
-                     'name': 'Land Use, Land Use Change, and Forestry'},
-        'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'], 'name': 'National Total Excluding LULUCF'},
-        '0': {'sources': ['1', '2', '3', '4', '5'], 'name': 'National Total'},
+processing_info_step2 = {
+    "aggregate_cats": {
+        "M.AG.ELV": {
+            "sources": ["3.C"],
+            "name": "Agriculture excluding livestock emissions",
+        },
+        "M.AG": {"sources": ["M.AG.ELV", "3.A"], "name": "Agriculture"},
+        "M.LULUCF": {
+            "sources": ["3.B", "3.D"],
+            "name": "Land Use, Land Use Change, and Forestry",
+        },
+        "M.0.EL": {
+            "sources": ["1", "2", "M.AG", "4", "5"],
+            "name": "National Total Excluding LULUCF",
+        },
+        "0": {"sources": ["1", "2", "3", "4", "5"], "name": "National Total"},
     },
-    'downscale': {
-        'sectors': {
-            '1': {
-                'basket': '1',
-                'basket_contents': ['1.A', '1.B', '1.C'],
-                'entities': ['CO2', 'N2O', 'CH4'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+    "downscale": {
+        "sectors": {
+            "1": {
+                "basket": "1",
+                "basket_contents": ["1.A", "1.B", "1.C"],
+                "entities": ["CO2", "N2O", "CH4"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '1.A': {
-                'basket': '1.A',
-                'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4'],
-                'entities': ['CO2', 'N2O', 'CH4'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "1.A": {
+                "basket": "1.A",
+                "basket_contents": ["1.A.1", "1.A.2", "1.A.3", "1.A.4"],
+                "entities": ["CO2", "N2O", "CH4"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '1.B': {
-                'basket': '1.B',
-                'basket_contents': ['1.B.1', '1.B.2', '1.B.3'],
-                'entities': ['CO2', 'N2O', 'CH4'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "1.B": {
+                "basket": "1.B",
+                "basket_contents": ["1.B.1", "1.B.2", "1.B.3"],
+                "entities": ["CO2", "N2O", "CH4"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            'IPPU': {
-                'basket': '2',
-                'basket_contents': ['2.A', '2.B', '2.C', '2.D', '2.E',
-                                    '2.F', '2.G', '2.H'],
-                'entities': ['CO2', 'N2O', 'CH4'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "IPPU": {
+                "basket": "2",
+                "basket_contents": [
+                    "2.A",
+                    "2.B",
+                    "2.C",
+                    "2.D",
+                    "2.E",
+                    "2.F",
+                    "2.G",
+                    "2.H",
+                ],
+                "entities": ["CO2", "N2O", "CH4"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '3': {
-                'basket': '3',
-                'basket_contents': ['3.A', '3.B', '3.C', '3.D'],
-                'entities': ['CO2', 'CH4', 'N2O'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "3": {
+                "basket": "3",
+                "basket_contents": ["3.A", "3.B", "3.C", "3.D"],
+                "entities": ["CO2", "CH4", "N2O"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
             # '3A': {
             #     'basket': '3.A',
@@ -442,17 +460,21 @@ processing_info_step2 =  {
             # },
         },
     },
-    'remove_ts': {
-        'fgases': { # unnecessary and complicates aggregation for
+    "remove_ts": {
+        "fgases": {  # unnecessary and complicates aggregation for
             # other gases
-            'category': ['5'],
-            'entities': [f'HFCS ({gwp_to_use})', f'PFCS ({gwp_to_use})', 'SF6',
-                         f'UnspMixOfHFCs ({gwp_to_use})'],
+            "category": ["5"],
+            "entities": [
+                f"HFCS ({gwp_to_use})",
+                f"PFCS ({gwp_to_use})",
+                "SF6",
+                f"UnspMixOfHFCs ({gwp_to_use})",
+            ],
         },
     },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR4GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS", "UnspMixOfHFCs"],
-        'source_GWP': gwp_to_use,
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR4GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS", "UnspMixOfHFCs"],
+        "source_GWP": gwp_to_use,
     },
 }

+ 137 - 103
src/unfccc_ghg_data/unfccc_reader/Nigeria/read_NGA_BUR2_from_pdf.py

@@ -1,5 +1,10 @@
-# this script reads data from Nigeria's BUR2
-# Data is read from the pdf file
+"""
+Read Nigeria's BUR2 from pdf
+
+This script reads data from Nigeria's BUR2
+Data are read from pdf using camelot
+
+"""
 
 import locale
 from copy import deepcopy
@@ -9,32 +14,32 @@ import numpy as np
 import pandas as pd
 import primap2 as pm2
 import xarray as xr
-from .config_nga_bur2 import (
-   cat_code_regexp,
-   cat_codes_manual,
-   coords_cols,
-   coords_defaults,
-   coords_terminologies,
-   coords_value_mapping,  #, add_coords_cols
-   entity_row,
-   filter_remove,
-   header_long,
-   index_cols,
-   meta_data,
-   pages_inventory,
-   processing_info_step1,
-   processing_info_step2,
-   tables_trends,
-   unit_row,
-   units_inv,
-   year_inventory,
+from config_nga_bur2 import (
+    cat_code_regexp,
+    cat_codes_manual,
+    coords_cols,
+    coords_defaults,
+    coords_terminologies,
+    coords_value_mapping,  # , add_coords_cols
+    entity_row,
+    filter_remove,
+    header_long,
+    index_cols,
+    meta_data,
+    pages_inventory,
+    processing_info_step1,
+    processing_info_step2,
+    tables_trends,
+    unit_row,
+    units_inv,
+    year_inventory,
 )
 
 from unfccc_ghg_data.helper import (
-   downloaded_data_path,
-   extracted_data_path,
-   gas_baskets,
-   process_data_for_country,
+    downloaded_data_path,
+    extracted_data_path,
+    gas_baskets,
+    process_data_for_country,
 )
 
 if __name__ == "__main__":
@@ -42,61 +47,74 @@ if __name__ == "__main__":
     # configuration
     # ###
     # define locale to use for str to float conversion
-    locale_to_use = 'en_NG.UTF-8'
+    locale_to_use = "en_NG.UTF-8"
     locale.setlocale(locale.LC_NUMERIC, locale_to_use)
 
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Nigeria' / 'BUR2'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Nigeria'
+    input_folder = downloaded_data_path / "UNFCCC" / "Nigeria" / "BUR2"
+    output_folder = extracted_data_path / "UNFCCC" / "Nigeria"
     if not output_folder.exists():
-       output_folder.mkdir()
+        output_folder.mkdir()
 
-    output_filename = 'NGA_BUR2_2021_'
+    output_filename = "NGA_BUR2_2021_"
     compression = dict(zlib=True, complevel=9)
-    inventory_file = 'NIGERIA_BUR_2_-_Second_Biennial_Update_Report_%28BUR2%29.pdf'
+    inventory_file = "NIGERIA_BUR_2_-_Second_Biennial_Update_Report_%28BUR2%29.pdf"
 
     ## read 2019 inventory
     df_inventory = None
     for page in pages_inventory.keys():
-        tables = camelot.read_pdf(str(input_folder / inventory_file), pages=str(page),
-                                  flavor='lattice')
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file), pages=str(page), flavor="lattice"
+        )
         df_this_table = tables[pages_inventory[page]].df
         # replace line breaks, double, and triple spaces in category names
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
         df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
         # replace line breaks in units and entities
-        df_this_table.iloc[entity_row] = df_this_table.iloc[entity_row].str.replace('\n',
-                                                                                    '')
-        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].str.replace('\n', '')
+        df_this_table.iloc[entity_row] = df_this_table.iloc[entity_row].str.replace(
+            "\n", ""
+        )
+        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].str.replace(
+            "\n", ""
+        )
 
         # fillna in unit row
-        df_this_table.iloc[unit_row][df_this_table.iloc[unit_row]==""] = np.nan
-        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].fillna(
-            method='ffill')
-        df_this_table = pm2.pm2io.nir_add_unit_information(df_this_table, unit_row=unit_row,
-                                                           entity_row=entity_row,
-                                                           regexp_entity=".*",
-                                                           manual_repl_unit=units_inv,
-                                                           default_unit="")
+        df_this_table.iloc[unit_row][df_this_table.iloc[unit_row] == ""] = np.nan
+        df_this_table.iloc[unit_row] = df_this_table.iloc[unit_row].ffill()
+        df_this_table = pm2.pm2io.nir_add_unit_information(
+            df_this_table,
+            unit_row=unit_row,
+            entity_row=entity_row,
+            regexp_entity=".*",
+            manual_repl_unit=units_inv,
+            default_unit="",
+        )
 
         # set index and convert to long format
         df_this_table = df_this_table.set_index(index_cols)
-        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(df_this_table, year_inventory,
-                                                              header_long)
+        df_this_table_long = pm2.pm2io.nir_convert_df_to_long(
+            df_this_table, year_inventory, header_long
+        )
 
         # combine with tables for other sectors (merge not append)
         if df_inventory is None:
             df_inventory = df_this_table_long
         else:
-            df_inventory = pd.concat([df_inventory, df_this_table_long], axis=0, join='outer')
+            df_inventory = pd.concat(
+                [df_inventory, df_this_table_long], axis=0, join="outer"
+            )
 
     # replace cat names by codes in col "category"
     # first the manual replacements
     df_inventory["category"] = df_inventory["category"].replace(cat_codes_manual)
+
     # then the regex replacements
-    def repl(m):
-       return m.group('code')
-    df_inventory["category"] = df_inventory["category"].str.replace(cat_code_regexp, repl, regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_inventory["category"] = df_inventory["category"].str.replace(
+        cat_code_regexp, repl, regex=True
+    )
     df_inventory = df_inventory.reset_index(drop=True)
 
     # ###
@@ -105,15 +123,15 @@ if __name__ == "__main__":
     data_inv_if = pm2.pm2io.convert_long_dataframe_if(
         df_inventory,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
         filter_remove=filter_remove,
         meta_data=meta_data,
         convert_str=True,
-        time_format='%Y',
-        )
+        time_format="%Y",
+    )
 
     data_inv_pm2 = pm2.pm2io.from_interchange_format(data_inv_if)
 
@@ -122,19 +140,21 @@ if __name__ == "__main__":
     for table in tables_trends.keys():
         print(table)
         current_table = deepcopy(tables_trends[table])
-        tables = camelot.read_pdf(str(input_folder / inventory_file),
-                                  pages=current_table["page"],
-                                  table_areas=current_table["area"],
-                                  columns=current_table["cols"],
-                                  flavor='stream',
-                                  split_text=True)
+        tables = camelot.read_pdf(
+            str(input_folder / inventory_file),
+            pages=current_table["page"],
+            table_areas=current_table["area"],
+            columns=current_table["cols"],
+            flavor="stream",
+            split_text=True,
+        )
         df_this_table = tables[0].df
 
         # merge rows for entity and unit
         rows_to_merge = df_this_table.iloc[current_table["label_rows"]]
         indices_to_merge = rows_to_merge.index
         # join the three rows
-        new_row = rows_to_merge.agg(' '.join)
+        new_row = rows_to_merge.agg(" ".join)
         df_this_table.loc[indices_to_merge[0]] = new_row
         df_this_table = df_this_table.drop(indices_to_merge)
         new_row = new_row.str.replace("  ", " ")
@@ -144,7 +164,7 @@ if __name__ == "__main__":
         df_this_table.columns = new_row
 
         # remove columns not needed
-        if 'remove_cols' in current_table.keys():
+        if "remove_cols" in current_table.keys():
             df_this_table = df_this_table.drop(columns=current_table["remove_cols"])
 
         df_this_table = df_this_table.set_index("Year")
@@ -155,12 +175,14 @@ if __name__ == "__main__":
         # remove "," (thousand sep) from data
         for col in df_this_table.columns:
             df_this_table.loc[:, col] = df_this_table.loc[:, col].str.strip()
-            def repl(m):
-               return m.group('part1') + m.group('part2')
-            df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(
-                '(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-            df_this_table[col][df_this_table[col].isnull()] = 'NaN'
 
+            def repl(m):  # noqa: D103
+                return m.group("part1") + m.group("part2")
+
+            df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(
+                "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+            )
+            df_this_table[col][df_this_table[col].isna()] = "NaN"
 
         # metadata in first col instead of index
         df_this_table = df_this_table.reset_index()
@@ -170,7 +192,7 @@ if __name__ == "__main__":
         df_this_table.columns = df_this_table.columns.map(str)
 
         # make copy of columns if a column is used twice for metadata
-        if 'copy_cols' in current_table.keys():
+        if "copy_cols" in current_table.keys():
             for col in current_table["copy_cols"]:
                 df_this_table[col] = df_this_table[current_table["copy_cols"][col]]
 
@@ -184,7 +206,7 @@ if __name__ == "__main__":
             coords_value_mapping=current_table["coords_value_mapping"],
             meta_data=meta_data,
             convert_str=True,
-            time_format='%Y',
+            time_format="%Y",
         )
 
         data_current_pm2 = pm2.pm2io.from_interchange_format(data_current_if)
@@ -193,7 +215,7 @@ if __name__ == "__main__":
         else:
             data_trend_pm2 = data_trend_pm2.pr.merge(data_current_pm2)
 
-    data_pm2 = data_inv_pm2.pr.merge(data_trend_pm2, tolerance=0.02) # some rounding in
+    data_pm2 = data_inv_pm2.pr.merge(data_trend_pm2, tolerance=0.02)  # some rounding in
     # trends needs higher tolerance
 
     data_if = data_pm2.pr.to_interchange_format()
@@ -205,48 +227,59 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_if)
+        data_if,
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] +
-                         "_raw.nc"),
-        encoding=encoding)
-
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
     #### processing
     data_proc_pm2 = data_pm2
     terminology_proc = coords_terminologies["category"]
 
     # combine CO2 emissions and removals
-    temp_CO2 = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
+    temp_CO2 = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum(
+        dim="entity", skipna=True, min_count=1
+    )
     data_proc_pm2["CO2"] = data_proc_pm2["CO2"].fillna(temp_CO2)
 
     # create net KYOTOGHG for 0 and 3
-    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"] \
-        = xr.full_like(data_proc_pm2["CO2 removals"],
-                       np.nan).pr.quantify(units="Gg CO2 / year")
-
-    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"].attrs = {"entity": "KYOTOGHG",
-                                                            "gwp_context": "AR5GWP100"}
-    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"] \
-        = data_proc_pm2.pr.gas_basket_contents_sum(
-        basket="KYOTOGHG removals (AR5GWP100)", basket_contents=['CO2 removals'],
-        skipna=True, min_count=1)
-    temp_KYOTOGHG = data_proc_pm2[["KYOTOGHG emissions (AR5GWP100)",
-                                   "KYOTOGHG removals (AR5GWP100)"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
-    data_proc_pm2["KYOTOGHG (AR5GWP100)"] \
-        = data_proc_pm2["KYOTOGHG (AR5GWP100)"].fillna(temp_KYOTOGHG)
-
+    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"] = xr.full_like(
+        data_proc_pm2["CO2 removals"], np.nan
+    ).pr.quantify(units="Gg CO2 / year")
+
+    data_proc_pm2["KYOTOGHG removals (AR5GWP100)"].attrs = {
+        "entity": "KYOTOGHG",
+        "gwp_context": "AR5GWP100",
+    }
+    data_proc_pm2[
+        "KYOTOGHG removals (AR5GWP100)"
+    ] = data_proc_pm2.pr.gas_basket_contents_sum(
+        basket="KYOTOGHG removals (AR5GWP100)",
+        basket_contents=["CO2 removals"],
+        skipna=True,
+        min_count=1,
+    )
+    temp_KYOTOGHG = data_proc_pm2[
+        ["KYOTOGHG emissions (AR5GWP100)", "KYOTOGHG removals (AR5GWP100)"]
+    ].pr.sum(dim="entity", skipna=True, min_count=1)
+    data_proc_pm2["KYOTOGHG (AR5GWP100)"] = data_proc_pm2[
+        "KYOTOGHG (AR5GWP100)"
+    ].fillna(temp_KYOTOGHG)
 
     # actual processing
     data_proc_pm2 = process_data_for_country(
         data_proc_pm2,
-        entities_to_ignore=['CO2 emissions', 'CO2 removals',
-                            'KYOTOGHG emissions (AR5GWP100)',
-                            'KYOTOGHG removals (AR5GWP100)'],
+        entities_to_ignore=[
+            "CO2 emissions",
+            "CO2 removals",
+            "KYOTOGHG emissions (AR5GWP100)",
+            "KYOTOGHG removals (AR5GWP100)",
+        ],
         gas_baskets={},
         processing_info_country=processing_info_step1,
     )
@@ -256,16 +289,16 @@ if __name__ == "__main__":
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
         processing_info_country=processing_info_step2,
-        cat_terminology_out = terminology_proc,
-        #category_conversion = None,
-        #sectors_out = None,
+        cat_terminology_out=terminology_proc,
+        # category_conversion = None,
+        # sectors_out = None,
     )
 
     # adapt source and metadata
     # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
     # ###
     # save data to IF and native format
@@ -274,9 +307,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Peru/__init__.py

@@ -0,0 +1,30 @@
+"""Read Peru's BURs, NIRs, NCs
+
+Scripts and configurations to read Peru's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'PER'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=PER
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 77 - 66
src/unfccc_ghg_data/unfccc_reader/Peru/config_per_bur3.py

@@ -1,3 +1,9 @@
+"""Config for Peru's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 table_def_templates = {
     "300": {  # 300
         "area": ["69,457,727,78"],
@@ -486,75 +492,80 @@ meta_data = {
 
 ## processing
 cat_conversion = {
-    'mapping': {
-        '0': '0',
-        '1': '1',
-        '1.A': '1.A',
-        '1.A.1': '1.A.1',
-        '1.A.2': '1.A.2',
-        '1.A.3': '1.A.3',
-        '1.A.4': '1.A.4',
-        '1.A.5': '1.A.5',
-        '1.B': '1.B',
-        '1.B.1': '1.B.1',
-        '1.B.2': '1.B.2',
-        '2': '2',
-        '2.A': '2.A',
-        '2.B': '2.B',
-        '2.C': '2.C',
-        '2.D': '2.D',
-        '2.E': '2.E',
-        '2.F': '2.F',
-        '2.G': '2.G',
-        '2.H': '2.H',
-        '3': 'M.AG',
-        '3.A': '3.A',
-        '3.A.1': '3.A.1',
-        '3.A.2': '3.A.2',
-        '3.C': '3.C',
-        '3.C.1': '3.C.1',
-        '3.C.2': '3.C.2',
-        '3.C.3': '3.C.3',
-        '3.C.4': '3.C.4',
-        '3.C.5': '3.C.5',
-        '3.C.6': '3.C.6',
-        '3.C.7': '3.C.7',
-        '4': 'M.LULUCF',
-        'M.2006.3.B': '3.B',
-        '4.A': '3.B.1',
-        '4.B': '3.B.2',
-        '4.C': '3.B.3',
-        '4.D': '3.B.4',
-        '4.E': '3.B.5',
-        '4.F': '3.B.6',
-        '4.G': '3.D.1',
-        '5': '4',
-        '5.A': '4.A',
-        '5.B': '4.B',
-        '5.C': '4.C',
-        '5.D': '4.D',
-        'M.BK': 'M.BK',
-        'M.BK.A': 'M.BK.A',
-        'M.BK.M': 'M.BM.M',
-        'M.BIO': 'M.BIO',
-    },
-    'aggregate': {
-        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
-              'name': 'IPPU'},
-        'M.3.C.AG': {
-            'sources': ['3.C'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG'],
-                     'name': 'Agriculture excluding livestock emissions'},
-        '3.D': {'sources': ['3.D.1'], 'name': 'Other'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    "mapping": {
+        "0": "0",
+        "1": "1",
+        "1.A": "1.A",
+        "1.A.1": "1.A.1",
+        "1.A.2": "1.A.2",
+        "1.A.3": "1.A.3",
+        "1.A.4": "1.A.4",
+        "1.A.5": "1.A.5",
+        "1.B": "1.B",
+        "1.B.1": "1.B.1",
+        "1.B.2": "1.B.2",
+        "2": "2",
+        "2.A": "2.A",
+        "2.B": "2.B",
+        "2.C": "2.C",
+        "2.D": "2.D",
+        "2.E": "2.E",
+        "2.F": "2.F",
+        "2.G": "2.G",
+        "2.H": "2.H",
+        "3": "M.AG",
+        "3.A": "3.A",
+        "3.A.1": "3.A.1",
+        "3.A.2": "3.A.2",
+        "3.C": "3.C",
+        "3.C.1": "3.C.1",
+        "3.C.2": "3.C.2",
+        "3.C.3": "3.C.3",
+        "3.C.4": "3.C.4",
+        "3.C.5": "3.C.5",
+        "3.C.6": "3.C.6",
+        "3.C.7": "3.C.7",
+        "4": "M.LULUCF",
+        "M.2006.3.B": "3.B",
+        "4.A": "3.B.1",
+        "4.B": "3.B.2",
+        "4.C": "3.B.3",
+        "4.D": "3.B.4",
+        "4.E": "3.B.5",
+        "4.F": "3.B.6",
+        "4.G": "3.D.1",
+        "5": "4",
+        "5.A": "4.A",
+        "5.B": "4.B",
+        "5.C": "4.C",
+        "5.D": "4.D",
+        "M.BK": "M.BK",
+        "M.BK.A": "M.BK.A",
+        "M.BK.M": "M.BM.M",
+        "M.BIO": "M.BIO",
+    },
+    "aggregate": {
+        "2": {
+            "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"],
+            "name": "IPPU",
+        },
+        "M.3.C.AG": {
+            "sources": ["3.C"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+        },
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG"],
+            "name": "Agriculture excluding livestock emissions",
+        },
+        "3.D": {"sources": ["3.D.1"], "name": "Other"},
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
     },
 }
 
 processing_info = {
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR4GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR4GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
     },
 }

+ 33 - 20
src/unfccc_ghg_data/unfccc_reader/Peru/read_PER_BUR3_from_pdf.py

@@ -1,12 +1,17 @@
-# read Singapore fifth BUR from pdf
+"""
+Read Peru's BUR3 from pdf
 
+This script reads data from Peru's BUR3
+Data are read from pdf using camelot
+
+"""
 
 import locale
 
 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_per_bur3 import (
+from config_per_bur3 import (
     cat_code_regexp,
     cat_codes_manual,
     cat_conversion,
@@ -103,20 +108,22 @@ if __name__ == "__main__":
 
             # drop cols if necessary
             if "drop_cols" in table_defs[page].keys():
-                # print(df_current.columns.values)
+                # print(df_current.columns.to_numpy())
                 df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
             elif "drop_cols" in table_def_templates[table_on_page].keys():
                 df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
 
             # rename category column
-            df_current.rename(
-                columns={table_defs[page]["category_col"]: index_cols[0]}, inplace=True
+            df_current = df_current.rename(
+                columns={table_defs[page]["category_col"]: index_cols[0]}
             )
 
             # replace double \n
             df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
             # replace double and triple spaces
-            df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+            df_current[index_cols[0]] = df_current[index_cols[0]].str.replace(
+                "   ", " "
+            )
             df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")
 
             # fix the split rows
@@ -137,7 +144,7 @@ if __name__ == "__main__":
             # set index
             # df_current = df_current.set_index(index_cols)
             # strip trailing and leading  and remove "^"
-            for col in df_current.columns.values:
+            for col in df_current.columns.to_numpy():
                 df_current[col] = df_current[col].str.strip()
                 df_current[col] = df_current[col].str.replace("^", "")
 
@@ -147,9 +154,9 @@ if __name__ == "__main__":
                 df_this_page = df_current.copy(deep=True)
             else:
                 # find intersecting cols
-                cols_this_page = df_this_page.columns.values
+                cols_this_page = df_this_page.columns.to_numpy()
                 # print(f"cols this page: {cols_this_page}")
-                cols_current = df_current.columns.values
+                cols_current = df_current.columns.to_numpy()
                 # print(f"cols current: {cols_current}")
                 cols_both = list(set(cols_this_page).intersection(set(cols_current)))
                 # print(f"cols both: {cols_both}")
@@ -179,7 +186,9 @@ if __name__ == "__main__":
         # drop the rows with memo items etc
         for cat in cats_remove:
             df_this_page_long = df_this_page_long.drop(
-                df_this_page_long.loc[df_this_page_long.loc[:, index_cols[0]] == cat].index
+                df_this_page_long.loc[
+                    df_this_page_long.loc[:, index_cols[0]] == cat
+                ].index
             )
 
         # make a copy of the categories row
@@ -187,12 +196,14 @@ if __name__ == "__main__":
 
         # replace cat names by codes in col "Categories"
         # first the manual replacements
-        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, "category"].replace(
-            cat_codes_manual
-        )
+        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+            :, "category"
+        ].replace(cat_codes_manual)
+
         # then the regex replacements
-        def repl(m):
+        def repl(m):  # noqa: D103
             return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
         df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
             :, "category"
         ].str.replace(cat_code_regexp, repl, regex=True)
@@ -211,8 +222,10 @@ if __name__ == "__main__":
             ".", ""
         )
         pat = r"^(?P<first>[0-9\.,]*),(?P<last>[0-9\.,]*)$"
-        def repl(m):
+
+        def repl(m):  # noqa: D103
             return f"{m.group('first')}.{m.group('last')}"
+
         df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
             pat, repl, regex=True
         )
@@ -265,12 +278,11 @@ if __name__ == "__main__":
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
         encoding=encoding,
     )
 
-    #### continue here
-
     # ###
     # ## process the data
     # ###
@@ -288,7 +300,7 @@ if __name__ == "__main__":
     )
 
     # adapt source and metadata
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
     data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
@@ -305,6 +317,7 @@ if __name__ == "__main__":
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
         encoding=encoding,
     )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/__init__.py

@@ -0,0 +1,30 @@
+"""Read South Korea's BURs, NIRs, NCs
+
+Scripts and configurations to read South Korea's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'KOR'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=KOR
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 511 - 403
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/config_kor_bur4.py

@@ -1,413 +1,513 @@
+"""Config for South Korea's 2021 and 2022 inventories and BUR4
+
+Partial configuration for camelot and data aggregation. PRIMAP2 conversion
+config and metadata are defined in the reading script
+
+"""
+
 original_names = [
-    '총배출량',
-    '순배출량',
-    '에너지',
-    'A. 연료연소',
-    '1. 에너지산업',
-    'a. 공공전기 및 열 생산',
-    'b. 석유정제',
-    'c. 고체연료 제조 및 기타 에너지 산업',
-    '2. 제조업 및 건설업',
-    'a. 철강',
-    'b. 비철금속',
-    'c. 화학',
-    'd. 펄프, 제지 및 인쇄',
-    'e. 식음료품 가공 및 담배 제조',
-    'f. 기타',
-    '  1. 비금속',
-    '  2. 조립금속',
-    '  3. 나무 및 목재',
-    '  4. 건설',
-    '  5. 섬유 및 가죽',
-    '  6. 기타제조',
-    '3. 수송',
-    'a. 민간항공',
-    'b. 도로수송',
-    'c. 철도',
-    'd. 해운',
-    'e. 기타수송',
-    '4. 기타',
-    'a. 상업/공공',
-    'b. 가정',
-    'c. 농업/임업/어업',
-    '5. 미분류',
-    'B. 탈루',
-    '1. 고체연료',
-    '2.  석유 및 천연가스',
-    'a.  석유',
-    'b. 천연가스',
-    '산업공정',
-    'A. 광물산업',
-    '1. 시멘트생산',
-    '2. 석회생산',
-    '3. 석회석 및 백운석 소비',
-    '4. 소다회 생산 및 소비',
-    '5. 아스팔트 루핑',
-    '6. 아스팔트 도로포장',
-    'B. 화학산업',
-    'C. 금속산업',
-    '1. 철강생산',
-    '2. 합금철 생산',
-    '3. 알루미늄 생산',
-    '4. 마그네슘 생산의 SF6 소비',
-    'D. 기타산업',
-    'E. 할로카본 및 육불화황 생산',
-    '1. 부산물 배출',
-    '2. 탈루 배출',
-    'F. 할로카본 및 육불화황 소비',
-    '1.  냉장 및 냉방',
-    '2.  발포제',
-    '3.  소화기',
-    '4.  에어로졸',
-    '5.  용매',
-    '6.  기타 용도의 ODS 대체물질 사용',
-    '7.  반도체 제조',
-    '8.  중전기기',
-    '9.  기타(잠재배출량)',
-    '농업',
-    'A.  장내발효',
-    '1. 소',
-    '2. 물소',
-    '3. 양(면양)',
-    '4. 양(산양)',
-    '5. 낙타 및 라마',
-    '6. 말',
-    '7. 노새 및 당나귀',
-    '8. 돼지',
-    '9. 가금류',
-    '10. 기타 가축(사슴)',
-    'B.  가축분뇨처리',
-    '1. 소',
-    '2. 물소',
-    '3. 양(면양)',
-    '4. 양(산양)',
-    '5. 낙타 및 라마',
-    '6. 말',
-    '7. 노새 및 당나귀',
-    '8. 돼지',
-    '9. 가금류',
-    '10. 기타 가축(사슴)',
-    'C.  벼재배',
-    '1. 관개',
-    '2. 천수답',
-    'D. 농경지토양',
-    '1. 직접배출',
-    '2. 목장, 방목구역, 분료(거름)',
-    '3. 간접배출',
-    'E. 사바나 소각',
-    'F. 작물잔사소각',
-    '1. 곡류',
-    '2. 두류(콩)',
-    '3. 근채류',
-    '4. 사탕수수',
-    '5. 기타',
-    'LULUCF',
-    'A. 산림지',
-    '1. 산림지로 유지된 산림지',
-    '2. 타토지에서 전용된 산림지',
-    '3. 산림지에서 질소 시비로 인한 N2O 배출',
-    '4. 산림지에서 배수로 인한 Non-CO2 배출',
-    '5. 산림지에서 바이오매스 연소에 의한 배출',
-    'B. 농경지',
-    '1. 농경지로 유지된 농경지',
-    '2. 타토지에서 전용된 농경지',
-    '3. 농경지로의 전용에 따른 N2O 배출',
-    '4. 농경지에서 농업용 석회시용으로 인한 CO2 배출',
-    '5. 농경지에서 바이오매스 연소에 의한 배출',
-    'C. 초지',
-    '1. 초지로 유지된 초지',
-    '2. 타토지에서 전용된 초지',
-    '3. 초지에서 농업용 석회시용으로 인한 CO2 배출',
-    '4. 초지에서 바이오매스 연소에 의한 배출',
-    'D. 습지',
-    '1. 습지로 유지된 습지',
-    '2. 타토지에서 전용된 습지',
-    '3. 습지에서 배수로 인한 Non-CO2 배출',
-    '4. 습지에서 바이오매스 연소에 의한 배출',
-    'E. 정주지',
-    'F. 기타토지',
-    '폐기물',
-    'A. 폐기물매립',
-    '1. 관리형 매립',
-    '2. 비관리형 매립',
-    'B. 하폐수처리',
-    '1. 폐수처리',
-    '2. 하수처리',
-    'C. 폐기물소각',
-    'D. 기타',
-    '별도항목(Memo Item)',
-    '분야·부문/연도',
-    'C. 국제벙커링 및 다국적 작전',
-    '1. 벙커링',
-    'a. 국제 항공',
-    'b. 국제 해운',
-    '2. 다국적 작전',
-    '* 참고 : NO = 배출활동 및 공정이 없는 경우, NE = 산정하지 아니하는 경우, NA = 자연적, 이론적으로 발생하지 않는 활동 및 공정의 경우, IE = 다른 항목에 포함하여 보고하는 경우, C = 기밀정보인 경우',
-    '3. 타토지로 전용된 농경지', # start of new codes in 2021 inventory
-    '4. 농경지로의 전용에 따른 N2O 배출',
-    '5. 농경지에서 농업용 석회시용으로 인한 CO2 배출',
-    '6. 농경지에서 바이오매스 연소에 의한 배출',
-    'G. 기타',
+    "총배출량",
+    "순배출량",
+    "에너지",
+    "A. 연료연소",
+    "1. 에너지산업",
+    "a. 공공전기 및 열 생산",
+    "b. 석유정제",
+    "c. 고체연료 제조 및 기타 에너지 산업",
+    "2. 제조업 및 건설업",
+    "a. 철강",
+    "b. 비철금속",
+    "c. 화학",
+    "d. 펄프, 제지 및 인쇄",
+    "e. 식음료품 가공 및 담배 제조",
+    "f. 기타",
+    "  1. 비금속",
+    "  2. 조립금속",
+    "  3. 나무 및 목재",
+    "  4. 건설",
+    "  5. 섬유 및 가죽",
+    "  6. 기타제조",
+    "3. 수송",
+    "a. 민간항공",
+    "b. 도로수송",
+    "c. 철도",
+    "d. 해운",
+    "e. 기타수송",
+    "4. 기타",
+    "a. 상업/공공",
+    "b. 가정",
+    "c. 농업/임업/어업",
+    "5. 미분류",
+    "B. 탈루",
+    "1. 고체연료",
+    "2.  석유 및 천연가스",
+    "a.  석유",
+    "b. 천연가스",
+    "산업공정",
+    "A. 광물산업",
+    "1. 시멘트생산",
+    "2. 석회생산",
+    "3. 석회석 및 백운석 소비",
+    "4. 소다회 생산 및 소비",
+    "5. 아스팔트 루핑",
+    "6. 아스팔트 도로포장",
+    "B. 화학산업",
+    "C. 금속산업",
+    "1. 철강생산",
+    "2. 합금철 생산",
+    "3. 알루미늄 생산",
+    "4. 마그네슘 생산의 SF6 소비",
+    "D. 기타산업",
+    "E. 할로카본 및 육불화황 생산",
+    "1. 부산물 배출",
+    "2. 탈루 배출",
+    "F. 할로카본 및 육불화황 소비",
+    "1.  냉장 및 냉방",
+    "2.  발포제",
+    "3.  소화기",
+    "4.  에어로졸",
+    "5.  용매",
+    "6.  기타 용도의 ODS 대체물질 사용",
+    "7.  반도체 제조",
+    "8.  중전기기",
+    "9.  기타(잠재배출량)",
+    "농업",
+    "A.  장내발효",
+    "1. 소",
+    "2. 물소",
+    "3. 양(면양)",
+    "4. 양(산양)",
+    "5. 낙타 및 라마",
+    "6. 말",
+    "7. 노새 및 당나귀",
+    "8. 돼지",
+    "9. 가금류",
+    "10. 기타 가축(사슴)",
+    "B.  가축분뇨처리",
+    "1. 소",
+    "2. 물소",
+    "3. 양(면양)",
+    "4. 양(산양)",
+    "5. 낙타 및 라마",
+    "6. 말",
+    "7. 노새 및 당나귀",
+    "8. 돼지",
+    "9. 가금류",
+    "10. 기타 가축(사슴)",
+    "C.  벼재배",
+    "1. 관개",
+    "2. 천수답",
+    "D. 농경지토양",
+    "1. 직접배출",
+    "2. 목장, 방목구역, 분료(거름)",
+    "3. 간접배출",
+    "E. 사바나 소각",
+    "F. 작물잔사소각",
+    "1. 곡류",
+    "2. 두류(콩)",
+    "3. 근채류",
+    "4. 사탕수수",
+    "5. 기타",
+    "LULUCF",
+    "A. 산림지",
+    "1. 산림지로 유지된 산림지",
+    "2. 타토지에서 전용된 산림지",
+    "3. 산림지에서 질소 시비로 인한 N2O 배출",
+    "4. 산림지에서 배수로 인한 Non-CO2 배출",
+    "5. 산림지에서 바이오매스 연소에 의한 배출",
+    "B. 농경지",
+    "1. 농경지로 유지된 농경지",
+    "2. 타토지에서 전용된 농경지",
+    "3. 농경지로의 전용에 따른 N2O 배출",
+    "4. 농경지에서 농업용 석회시용으로 인한 CO2 배출",
+    "5. 농경지에서 바이오매스 연소에 의한 배출",
+    "C. 초지",
+    "1. 초지로 유지된 초지",
+    "2. 타토지에서 전용된 초지",
+    "3. 초지에서 농업용 석회시용으로 인한 CO2 배출",
+    "4. 초지에서 바이오매스 연소에 의한 배출",
+    "D. 습지",
+    "1. 습지로 유지된 습지",
+    "2. 타토지에서 전용된 습지",
+    "3. 습지에서 배수로 인한 Non-CO2 배출",
+    "4. 습지에서 바이오매스 연소에 의한 배출",
+    "E. 정주지",
+    "F. 기타토지",
+    "폐기물",
+    "A. 폐기물매립",
+    "1. 관리형 매립",
+    "2. 비관리형 매립",
+    "B. 하폐수처리",
+    "1. 폐수처리",
+    "2. 하수처리",
+    "C. 폐기물소각",
+    "D. 기타",
+    "별도항목(Memo Item)",
+    "분야·부문/연도",
+    "C. 국제벙커링 및 다국적 작전",
+    "1. 벙커링",
+    "a. 국제 항공",
+    "b. 국제 해운",
+    "2. 다국적 작전",
+    "* 참고 : NO = 배출활동 및 공정이 없는 경우, NE = 산정하지 아니하는 경우, NA = 자연적, "
+    "이론적으로 발생하지 않는 활동 및 공정의 경우, IE = 다른 항목에 포함하여 보고하는 경우, "
+    "C = 기밀정보인 경우",
+    "3. 타토지로 전용된 농경지",  # start of new codes in 2021 inventory
+    "4. 농경지로의 전용에 따른 N2O 배출",
+    "5. 농경지에서 농업용 석회시용으로 인한 CO2 배출",
+    "6. 농경지에서 바이오매스 연소에 의한 배출",
+    "G. 기타",
 ]
 translations = [
-    ['Total emissions', 'M.0.EL'],
-    ['Net emissions', '0'],
-    ['energy', '1'],
-    ['A. Fuel combustion', '1.A'],
-    ['1. Energy industry', '1.A.1'],
-    ['a. Public electricity and heat production', '1.A.1.a'],
-    ['b. Oil refining', '1.A.1.b'],
-    ['c. Solid fuel manufacturing and other energy industries', '1.A.1.c'],
-    ['2. Manufacturing and construction', '1.A.2'],
-    ['a. steel', '1.A.2.a'],
-    ['b. Non-ferrous metal', '1.A.2.b'],
-    ['c. chemistry', '1.A.2.c'],
-    ['d. Pulp, paper and printing', '1.A.2.d'],
-    ['e. Food and beverage processing and tobacco manufacturing', '1.A.2.e'],
-    ['f. Etc', '1.A.2.f'],
-    ['  1. Non-metal', '1.A.2.f.1'],
-    ['  2. Assembly metal', '1.A.2.f.2'],
-    ['  3. Wood and timber', '1.A.2.f.3'],
-    ['  4. Construction', '1.A.2.f.4'],
-    ['  5. Textile and leather', '1.A.2.f.5'],
-    ['  6. Other manufacturing', '1.A.2.f.6'],
-    ['3. Transportation', '1.A.3'],
-    ['a. Civil aviation', '1.A.3.a.2'],
-    ['b. Road transport', '1.A.3.b'],
-    ['c. railroad', '1.A.3.c'],
-    ['d. shipping', '1.A.3.d.2'],
-    ['e. Other transport', '1.A.3.e'],
-    ['4. Other', '1.A.4'],
-    ['a. Commercial/Public', '1.A.4.a'],
-    ['b. home', '1.A.4.b'],
-    ['c. Agriculture/Forestry/Fishing', '1.A.4.c'],
-    ['5. Uncategorized', '1.A.5'],
-    ['B. Talu', '1.B'],
-    ['1. Solid fuel', '1.B.1'],
-    ['2. Oil and natural gas', '1.B.2'],
-    ['a. oil', '1.B.2.a'],
-    ['b. Natural gas', '1.B.2.b'],
-    ['Industrial process', '2'],
-    ['A. Mineral industry', '2.A'],
-    ['1. Cement production', '2.A.1'],
-    ['2. Lime production', '2.A.2'],
-    ['3. Limestone and Dolomite Consumption', '2.A.3'],
-    ['4. Soda ash production and consumption', '2.A.4'],
-    ['5. Asphalt roofing', '2.A.5'],
-    ['6. Asphalt road pavement', '2.A.6'],
-    ['B. Chemical industry', '2.B'],
-    ['C. Metal Industry', '2.C'],
-    ['1. Steel production', '2.C.1'],
-    ['2. Ferroalloy production', '2.C.2'],
-    ['3. Aluminum production', '2.C.3'],
-    ['4. SF6 consumption in magnesium production', '2.C.4'],
-    ['D. Other industries', '2.D'],
-    ['E. Production of halocarbons and sulfur hexafluoride', '2.E'],
-    ['1. Emission of by-products', '2.E.1'],
-    ['2. Fugitive discharge', '2.E.2'],
-    ['F. Consumption of halocarbons and sulfur hexafluoride', '2.F'],
-    ['1. Refrigeration and cooling', '2.F.1'],
-    ['2. Foaming agent', '2.F.2'],
-    ['3. Fire extinguisher', '2.F.3'],
-    ['4. Aerosol', '2.F.4'],
-    ['5. Solvent', '2.F.5'],
-    ['6. Use of ODS substitutes for other purposes', '2.F.6'],
-    ['7. Semiconductor manufacturing', '2.F.7'],
-    ['8. Heavy electric machine', '2.F.8'],
-    ['9. Others (potential emissions)', '2.F.9'],
-    ['Agriculture', '4'],
-    ['A. Intestinal fermentation', '4.A'],
-    ['1. cow', '4.A.1'],
-    ['2. Water buffalo', '4.A.2'],
-    ['3. Sheep (Cotton Sheep)', '4.A.3'],
-    ['4. Sheep (Goat)', '4.A.4'],
-    ['5. Camel and Llama', '4.A.5'],
-    ['6. Horse', '4.A.6'],
-    ['7. Mules and Donkeys', '4.A.7'],
-    ['8. Pig', '4.A.8'],
-    ['9. Poultry', '4.A.9'],
-    ['10. Other livestock (deer)', '4.A.10'],
-    ['B. Livestock manure treatment', '4.B'],
-    ['1. cow', '4.B.1'],
-    ['2. Water buffalo', '4.B.2'],
-    ['3. Sheep (Cotton Sheep)', '4.B.3'],
-    ['4. Sheep (Goat)', '4.B.4'],
-    ['5. Camel and Llama', '4.B.5'],
-    ['6. Horse', '4.B.6'],
-    ['7. Mules and Donkeys', '4.B.7'],
-    ['8. Pig', '4.B.8'],
-    ['9. Poultry', '4.B.9'],
-    ['10. Other livestock (deer)', '4.B.10'],
-    ['C. Rice cultivation', '4.C'],
-    ['1. irrigation', '4.C.1'],
-    ['2. Thousand answers', '4.C.4'],
-    ['D. Cropland soil', '4.D'],
-    ['1. Direct discharge', '4.D.1'],
-    ['2. Ranch, grazing area, manure (manure)', '4.D.2'],
-    ['3. Indirect emissions', '4.D.3'],
-    ['E. Savannah incineration', '4.E'],
-    ['F. Crop residue incineration', '4.F'],
-    ['1. Grains', '4.F.1'],
-    ['2. Beans (beans)', '4.F.2'],
-    ['3. Root vegetables', '4.F.3'],
-    ['4. Sugar cane', '4.F.4'],
-    ['5. Other', '4.F.5'],
-    ['LULUCF', '5'],
-    ['A. Forest land', '5.A'],
-    ['1. Forest land maintained as a forest land', '5.A.1'],  # categories differ from IPCC1996
-    ['2. Forest land converted from other lands', '5.A.2'],  # categories differ from IPCC1996
-    ['3. N2O emissions from nitrogen fertilization in forest areas', '5.A.3'],  # categories differ from IPCC1996
-    ['4. Non-CO2 emission due to drainage in forest areas', '5.A.4'],  # categories differ from IPCC1996
-    ['5. Emissions from biomass combustion in forest areas', '5.A.5'],  # categories differ from IPCC1996
-    ['B. Cropland', '5.B'],
-    ['1. Agricultural land maintained as agricultural land', '5.B.1'],  # categories differ from IPCC1996
-    ['2. Cropland converted from other lands', '5.B.2'],  # categories differ from IPCC1996
-    ['3. N2O emission due to conversion to agricultural land', '5.B.3'],  # categories differ from IPCC1996
-    ['4. CO2 emission from agricultural lime application in agricultural land', '5.B.4'],  # categories differ from IPCC1996
-    ['5. Emissions from biomass combustion in agricultural land', '5.B.5'],  # categories differ from IPCC1996
-    ['C. Grassland', '5.C'],
-    ['1. Grassland maintained as grassland', '5.C.1'],  # categories differ from IPCC1996
-    ['2. Grassland dedicated to Tatoji', '5.C.2'],  # categories differ from IPCC1996
-    ['3. CO2 emission from agricultural lime application in grassland', '5.C.3'],  # categories differ from IPCC1996
-    ['4. Emissions from biomass combustion in grassland', '5.C.4'],  # categories differ from IPCC1996
-    ['D. Wetlands', '5.D'],
-    ['1. Wetlands maintained as wetlands', '5.D.1'],  # categories differ from IPCC1996
-    ['2. Wetlands converted from Tatoji', '5.D.2'],  # categories differ from IPCC1996
-    ['3. Non-CO2 emission due to drainage in wetlands', '5.D.3'],  # categories differ from IPCC1996
-    ['4. Emissions from biomass combustion in wetlands', '5.D.4'],  # categories differ from IPCC1996
-    ['E. Jeongju-ji', '5.E'],
-    ['F. Other land', '5.F'],
-    ['waste', '6'],
-    ['A. Landfill of waste', '6.A'],
-    ['1. Managed landfill', '6.A.1'],
-    ['2. Unmanaged landfill', '6.A.2'],
-    ['B. Sewage water treatment', '6.B'],
-    ['1. Wastewater treatment', '6.B.1'],  # categories differ from IPCC1996
-    ['2. Sewage treatment', '6.B.2'],  # categories differ from IPCC1996
-    ['C. Waste incineration', '6.C'],
-    ['D. Other', '6.D'],
-    ['Memo Item', '\\IGNORE'],
-    ['Field·Sector/Year', '\\IGNORE'],
-    ['C. International bunkering and multinational operations', '\\IGNORE'],
-    ['1. Bunkering', 'M.1'],
-    ['a. International aviation', 'M.1.A'],
-    ['b. International shipping', 'M.1.B'],
-    ['2. Multinational operations', 'M.2'],
-    ['', '\\IGNORE'],
-    ['3. Farmland converted to Tato land', '5.B.3'],  # new codes in 2021 inventory start here
-    ['4. N2O emission due to conversion to agricultural land', '5.B.4'],
-    ['5. CO2 emission from agricultural lime application in agricultural land', '5.B.5'],
-    ['6. Emissions from burning biomass on agricultural land', '5.B.6'],
-    ['G. Others', '5.G'],
+    ["Total emissions", "M.0.EL"],
+    ["Net emissions", "0"],
+    ["energy", "1"],
+    ["A. Fuel combustion", "1.A"],
+    ["1. Energy industry", "1.A.1"],
+    ["a. Public electricity and heat production", "1.A.1.a"],
+    ["b. Oil refining", "1.A.1.b"],
+    ["c. Solid fuel manufacturing and other energy industries", "1.A.1.c"],
+    ["2. Manufacturing and construction", "1.A.2"],
+    ["a. steel", "1.A.2.a"],
+    ["b. Non-ferrous metal", "1.A.2.b"],
+    ["c. chemistry", "1.A.2.c"],
+    ["d. Pulp, paper and printing", "1.A.2.d"],
+    ["e. Food and beverage processing and tobacco manufacturing", "1.A.2.e"],
+    ["f. Etc", "1.A.2.f"],
+    ["  1. Non-metal", "1.A.2.f.1"],
+    ["  2. Assembly metal", "1.A.2.f.2"],
+    ["  3. Wood and timber", "1.A.2.f.3"],
+    ["  4. Construction", "1.A.2.f.4"],
+    ["  5. Textile and leather", "1.A.2.f.5"],
+    ["  6. Other manufacturing", "1.A.2.f.6"],
+    ["3. Transportation", "1.A.3"],
+    ["a. Civil aviation", "1.A.3.a.2"],
+    ["b. Road transport", "1.A.3.b"],
+    ["c. railroad", "1.A.3.c"],
+    ["d. shipping", "1.A.3.d.2"],
+    ["e. Other transport", "1.A.3.e"],
+    ["4. Other", "1.A.4"],
+    ["a. Commercial/Public", "1.A.4.a"],
+    ["b. home", "1.A.4.b"],
+    ["c. Agriculture/Forestry/Fishing", "1.A.4.c"],
+    ["5. Uncategorized", "1.A.5"],
+    ["B. Talu", "1.B"],
+    ["1. Solid fuel", "1.B.1"],
+    ["2. Oil and natural gas", "1.B.2"],
+    ["a. oil", "1.B.2.a"],
+    ["b. Natural gas", "1.B.2.b"],
+    ["Industrial process", "2"],
+    ["A. Mineral industry", "2.A"],
+    ["1. Cement production", "2.A.1"],
+    ["2. Lime production", "2.A.2"],
+    ["3. Limestone and Dolomite Consumption", "2.A.3"],
+    ["4. Soda ash production and consumption", "2.A.4"],
+    ["5. Asphalt roofing", "2.A.5"],
+    ["6. Asphalt road pavement", "2.A.6"],
+    ["B. Chemical industry", "2.B"],
+    ["C. Metal Industry", "2.C"],
+    ["1. Steel production", "2.C.1"],
+    ["2. Ferroalloy production", "2.C.2"],
+    ["3. Aluminum production", "2.C.3"],
+    ["4. SF6 consumption in magnesium production", "2.C.4"],
+    ["D. Other industries", "2.D"],
+    ["E. Production of halocarbons and sulfur hexafluoride", "2.E"],
+    ["1. Emission of by-products", "2.E.1"],
+    ["2. Fugitive discharge", "2.E.2"],
+    ["F. Consumption of halocarbons and sulfur hexafluoride", "2.F"],
+    ["1. Refrigeration and cooling", "2.F.1"],
+    ["2. Foaming agent", "2.F.2"],
+    ["3. Fire extinguisher", "2.F.3"],
+    ["4. Aerosol", "2.F.4"],
+    ["5. Solvent", "2.F.5"],
+    ["6. Use of ODS substitutes for other purposes", "2.F.6"],
+    ["7. Semiconductor manufacturing", "2.F.7"],
+    ["8. Heavy electric machine", "2.F.8"],
+    ["9. Others (potential emissions)", "2.F.9"],
+    ["Agriculture", "4"],
+    ["A. Intestinal fermentation", "4.A"],
+    ["1. cow", "4.A.1"],
+    ["2. Water buffalo", "4.A.2"],
+    ["3. Sheep (Cotton Sheep)", "4.A.3"],
+    ["4. Sheep (Goat)", "4.A.4"],
+    ["5. Camel and Llama", "4.A.5"],
+    ["6. Horse", "4.A.6"],
+    ["7. Mules and Donkeys", "4.A.7"],
+    ["8. Pig", "4.A.8"],
+    ["9. Poultry", "4.A.9"],
+    ["10. Other livestock (deer)", "4.A.10"],
+    ["B. Livestock manure treatment", "4.B"],
+    ["1. cow", "4.B.1"],
+    ["2. Water buffalo", "4.B.2"],
+    ["3. Sheep (Cotton Sheep)", "4.B.3"],
+    ["4. Sheep (Goat)", "4.B.4"],
+    ["5. Camel and Llama", "4.B.5"],
+    ["6. Horse", "4.B.6"],
+    ["7. Mules and Donkeys", "4.B.7"],
+    ["8. Pig", "4.B.8"],
+    ["9. Poultry", "4.B.9"],
+    ["10. Other livestock (deer)", "4.B.10"],
+    ["C. Rice cultivation", "4.C"],
+    ["1. irrigation", "4.C.1"],
+    ["2. Thousand answers", "4.C.4"],
+    ["D. Cropland soil", "4.D"],
+    ["1. Direct discharge", "4.D.1"],
+    ["2. Ranch, grazing area, manure (manure)", "4.D.2"],
+    ["3. Indirect emissions", "4.D.3"],
+    ["E. Savannah incineration", "4.E"],
+    ["F. Crop residue incineration", "4.F"],
+    ["1. Grains", "4.F.1"],
+    ["2. Beans (beans)", "4.F.2"],
+    ["3. Root vegetables", "4.F.3"],
+    ["4. Sugar cane", "4.F.4"],
+    ["5. Other", "4.F.5"],
+    ["LULUCF", "5"],
+    ["A. Forest land", "5.A"],
+    [
+        "1. Forest land maintained as a forest land",
+        "5.A.1",
+    ],  # categories differ from IPCC1996
+    [
+        "2. Forest land converted from other lands",
+        "5.A.2",
+    ],  # categories differ from IPCC1996
+    [
+        "3. N2O emissions from nitrogen fertilization in forest areas",
+        "5.A.3",
+    ],  # categories differ from IPCC1996
+    [
+        "4. Non-CO2 emission due to drainage in forest areas",
+        "5.A.4",
+    ],  # categories differ from IPCC1996
+    [
+        "5. Emissions from biomass combustion in forest areas",
+        "5.A.5",
+    ],  # categories differ from IPCC1996
+    ["B. Cropland", "5.B"],
+    [
+        "1. Agricultural land maintained as agricultural land",
+        "5.B.1",
+    ],  # categories differ from IPCC1996
+    [
+        "2. Cropland converted from other lands",
+        "5.B.2",
+    ],  # categories differ from IPCC1996
+    [
+        "3. N2O emission due to conversion to agricultural land",
+        "5.B.3",
+    ],  # categories differ from IPCC1996
+    [
+        "4. CO2 emission from agricultural lime application in agricultural land",
+        "5.B.4",
+    ],  # categories differ from IPCC1996
+    [
+        "5. Emissions from biomass combustion in agricultural land",
+        "5.B.5",
+    ],  # categories differ from IPCC1996
+    ["C. Grassland", "5.C"],
+    [
+        "1. Grassland maintained as grassland",
+        "5.C.1",
+    ],  # categories differ from IPCC1996
+    ["2. Grassland dedicated to Tatoji", "5.C.2"],  # categories differ from IPCC1996
+    [
+        "3. CO2 emission from agricultural lime application in grassland",
+        "5.C.3",
+    ],  # categories differ from IPCC1996
+    [
+        "4. Emissions from biomass combustion in grassland",
+        "5.C.4",
+    ],  # categories differ from IPCC1996
+    ["D. Wetlands", "5.D"],
+    ["1. Wetlands maintained as wetlands", "5.D.1"],  # categories differ from IPCC1996
+    ["2. Wetlands converted from Tatoji", "5.D.2"],  # categories differ from IPCC1996
+    [
+        "3. Non-CO2 emission due to drainage in wetlands",
+        "5.D.3",
+    ],  # categories differ from IPCC1996
+    [
+        "4. Emissions from biomass combustion in wetlands",
+        "5.D.4",
+    ],  # categories differ from IPCC1996
+    ["E. Jeongju-ji", "5.E"],
+    ["F. Other land", "5.F"],
+    ["waste", "6"],
+    ["A. Landfill of waste", "6.A"],
+    ["1. Managed landfill", "6.A.1"],
+    ["2. Unmanaged landfill", "6.A.2"],
+    ["B. Sewage water treatment", "6.B"],
+    ["1. Wastewater treatment", "6.B.1"],  # categories differ from IPCC1996
+    ["2. Sewage treatment", "6.B.2"],  # categories differ from IPCC1996
+    ["C. Waste incineration", "6.C"],
+    ["D. Other", "6.D"],
+    ["Memo Item", "\\IGNORE"],
+    ["Field·Sector/Year", "\\IGNORE"],
+    ["C. International bunkering and multinational operations", "\\IGNORE"],
+    ["1. Bunkering", "M.1"],
+    ["a. International aviation", "M.1.A"],
+    ["b. International shipping", "M.1.B"],
+    ["2. Multinational operations", "M.2"],
+    ["", "\\IGNORE"],
+    [
+        "3. Farmland converted to Tato land",
+        "5.B.3",
+    ],  # new codes in 2021 inventory start here
+    ["4. N2O emission due to conversion to agricultural land", "5.B.4"],
+    [
+        "5. CO2 emission from agricultural lime application in agricultural land",
+        "5.B.5",
+    ],
+    ["6. Emissions from burning biomass on agricultural land", "5.B.6"],
+    ["G. Others", "5.G"],
 ]
 cat_name_translations = dict(zip(original_names, [cat[0] for cat in translations]))
 cat_codes = dict(zip(original_names, [cat[1] for cat in translations]))
 
 remove_cats = [
-    '1.A.1.a', '1.A.1.b', '1.A.1.c', '1.A.2.f',
-    '2.A', '2.D',
-    '2.F', '2.G',
-    '4.C.1', '4.C.4',
-    '4.D',
-    '4.F.1', '4.F.2', '4.F.3', '4.F.4', '4.F.5',  # detail not in 2006 categories
-    '5.A', '5.A.1', '5.A.2', '5.A.3', '5.A.4', '5.A.5',  # don't not match IPCC
+    "1.A.1.a",
+    "1.A.1.b",
+    "1.A.1.c",
+    "1.A.2.f",
+    "2.A",
+    "2.D",
+    "2.F",
+    "2.G",
+    "4.C.1",
+    "4.C.4",
+    "4.D",
+    "4.F.1",
+    "4.F.2",
+    "4.F.3",
+    "4.F.4",
+    "4.F.5",  # detail not in 2006 categories
+    "5.A",
+    "5.A.1",
+    "5.A.2",
+    "5.A.3",
+    "5.A.4",
+    "5.A.5",  # do not match IPCC
     # categories
-    '5.B', '5.B.1', '5.B.2', '5.B.3', '5.B.4', '5.B.5',
-    '5.C', '5.C.1', '5.C.2', '5.C.3', '5.C.4',
-    '5.D', '5.D.1', '5.D.2', '5.D.3', '5.D.4',
-    '5.E', '5.F',
-    '5.G', '5.B.6', # for 2021 NIR
+    "5.B",
+    "5.B.1",
+    "5.B.2",
+    "5.B.3",
+    "5.B.4",
+    "5.B.5",
+    "5.C",
+    "5.C.1",
+    "5.C.2",
+    "5.C.3",
+    "5.C.4",
+    "5.D",
+    "5.D.1",
+    "5.D.2",
+    "5.D.3",
+    "5.D.4",
+    "5.E",
+    "5.F",
+    "5.G",
+    "5.B.6",  # for 2021 NIR
 ]
 
 aggregate_before_mapping = {
-    '2006.2.D.4': {'sources': ['2.A.5', '2.A.6'], 'name': 'Other'},
-    '2006.3.C.4': {'sources': ['4.D.1', '4.D.2'],
-                   'name': 'Direct N2O Emissions from Managed Soils'},
-    '2006.M.3C1AG': {'sources': ['4.E', '4.F'], 'name': 'Biomass burning Agriculture'},
-    '2006.1.A.2.m': {'sources': ['1.A.2.f.2', '1.A.2.f.6'], 'name': 'Other'},
+    "2006.2.D.4": {"sources": ["2.A.5", "2.A.6"], "name": "Other"},
+    "2006.3.C.4": {
+        "sources": ["4.D.1", "4.D.2"],
+        "name": "Direct N2O Emissions from Managed Soils",
+    },
+    "2006.M.3C1AG": {"sources": ["4.E", "4.F"], "name": "Biomass burning Agriculture"},
+    "2006.1.A.2.m": {"sources": ["1.A.2.f.2", "1.A.2.f.6"], "name": "Other"},
 }
 
 cat_mapping = {
-    '1.A.2.f.1': '1.A.2.f',
-    '1.A.2.f.3': '1.A.2.j',
-    '1.A.2.f.4': '1.A.2.k',
-    '1.A.2.f.5': '1.A.2.l',
-    '2006.1.A.2.m': '1.A.2.m',
-    '2.A.4': '2.B.7',  # add to 2.B
-    '2.A.3': '2.A.4',
-    '2.D': '2.H',
-    '2006.2.D.4': '2.D.4',
-    '2.E': '2.B.9',  # add to 2.B
-    '2.E.1': '2.B.9.a',
-    '2.E.2': '2.B.9.b',
+    "1.A.2.f.1": "1.A.2.f",
+    "1.A.2.f.3": "1.A.2.j",
+    "1.A.2.f.4": "1.A.2.k",
+    "1.A.2.f.5": "1.A.2.l",
+    "2006.1.A.2.m": "1.A.2.m",
+    "2.A.4": "2.B.7",  # add to 2.B
+    "2.A.3": "2.A.4",
+    "2.D": "2.H",
+    "2006.2.D.4": "2.D.4",
+    "2.E": "2.B.9",  # add to 2.B
+    "2.E.1": "2.B.9.a",
+    "2.E.2": "2.B.9.b",
     #    '2.F', # remove?
-    '2.F.1': '2.F.1',  # just added here to avoid confusion
+    "2.F.1": "2.F.1",  # just added here to avoid confusion
     #    '2.F.2', '2.F.3', '2.F.4', '2.F.5',
-    '2.F.6': '2.E_1',
-    '2.F.7': '2.E_2',
-    '2.F.8': '2.G.1',
-    '2.F.9': '2.G.2',
-    '4': 'M.AG',
-    '4.A': '3.A.1',
-    '4.A.1': '3.A.1.a',
-    '4.A.2': '3.A.1.b',
-    '4.A.3': '3.A.1.c',
-    '4.A.4': '3.A.1.d',
-    '4.A.5': '3.A.1.e',
-    '4.A.6': '3.A.1.f',
-    '4.A.7': '3.A.1.g',
-    '4.A.8': '3.A.1.h',
-    '4.A.9': '3.A.1.i',
-    '4.A.10': '3.A.1.j',
-    '4.B': '3.A.2',
-    '4.B.1': '3.A.2.a',
-    '4.B.2': '3.A.2.b',
-    '4.B.3': '3.A.2.c',
-    '4.B.4': '3.A.2.d',
-    '4.B.5': '3.A.2.e',
-    '4.B.6': '3.A.2.f',
-    '4.B.7': '3.A.2.g',
-    '4.B.8': '3.A.2.h',
-    '4.B.9': '3.A.2.i',
-    '4.B.10': '3.A.2.j',
-    '4.C': '3.C.7',
-    '2006.3.C.4': '3.C.4',
-    '4.D.3': '3.C.5',
-    '2006.M.3C1AG': 'M.3.C.1.AG',
-    '5': 'M.LULUCF',
-    '6': '4',
-    '6.A': '4.A',
-    '6.A.1': '4.A.1',
-    '6.A.2': '4.A.2',
-    '6.B': '4.D',
-    '6.B.1': '4.D.1',
-    '6.B.2': '4.D.2',
-    '6.C': '4.C.1',
-    '6.D': '4.E',
-    'M.1': 'M.BK',
-    'M.1.A': 'M.BK.A',
-    'M.1.B': 'M.BK.M',
+    "2.F.6": "2.E_1",
+    "2.F.7": "2.E_2",
+    "2.F.8": "2.G.1",
+    "2.F.9": "2.G.2",
+    "4": "M.AG",
+    "4.A": "3.A.1",
+    "4.A.1": "3.A.1.a",
+    "4.A.2": "3.A.1.b",
+    "4.A.3": "3.A.1.c",
+    "4.A.4": "3.A.1.d",
+    "4.A.5": "3.A.1.e",
+    "4.A.6": "3.A.1.f",
+    "4.A.7": "3.A.1.g",
+    "4.A.8": "3.A.1.h",
+    "4.A.9": "3.A.1.i",
+    "4.A.10": "3.A.1.j",
+    "4.B": "3.A.2",
+    "4.B.1": "3.A.2.a",
+    "4.B.2": "3.A.2.b",
+    "4.B.3": "3.A.2.c",
+    "4.B.4": "3.A.2.d",
+    "4.B.5": "3.A.2.e",
+    "4.B.6": "3.A.2.f",
+    "4.B.7": "3.A.2.g",
+    "4.B.8": "3.A.2.h",
+    "4.B.9": "3.A.2.i",
+    "4.B.10": "3.A.2.j",
+    "4.C": "3.C.7",
+    "2006.3.C.4": "3.C.4",
+    "4.D.3": "3.C.5",
+    "2006.M.3C1AG": "M.3.C.1.AG",
+    "5": "M.LULUCF",
+    "6": "4",
+    "6.A": "4.A",
+    "6.A.1": "4.A.1",
+    "6.A.2": "4.A.2",
+    "6.B": "4.D",
+    "6.B.1": "4.D.1",
+    "6.B.2": "4.D.2",
+    "6.C": "4.C.1",
+    "6.D": "4.E",
+    "M.1": "M.BK",
+    "M.1.A": "M.BK.A",
+    "M.1.B": "M.BK.M",
 }
 
 aggregate_after_mapping = {
-    '1.A.3.a': {'sources': ['1.A.3.a.2'], 'name': 'Civil Aviation'},  # aviation
-    '1.A.3.d': {'sources': ['1.A.3.d.2'], 'name': 'Water-borne Navigation'},  # shipping
-    '2.A': {'sources': ['2.A.1', '2.A.2', '2.A.4', '2.A.5', '2.A.6'],
-            'name': 'Mineral Industry'},
-    '2.B': {'sources': ['2.B', '2.B.7', '2.B.9'], 'name': 'Chemical Industry'},
-    '2.D': {'sources': ['2.D.4'], 'name': 'Other'},
-    '2.E': {'sources': ['2.E_1', '2.E_2'], 'name': 'Electronics Industry'},
-    '2.F': {'sources': ['2.F.1', '2.F.2', '2.F.3', '2.F.4', '2.F.5'],
-            'name': 'Product uses as Substitutes for Ozone Depleting Substances'},
-    '2.G': {'sources': ['2.G.1', '2.G.2'], 'name': 'Other Product Manufacture and Use'},
-    '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-    '3.C': {'sources': ['3.C.4', '3.C.5', '3.C.7'],
-                 'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-    'M.3.C.AG': {'sources': ['3.C.4', '3.C.5', '3.C.7'],
-                 'name': 'Aggregate sources and non-CO2 emissions sources on land ('
-                         'Agriculture)'},
-    'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock'},
-    '4.C': {'sources': ['4.C.1'], 'name': 'Incineration and Open Burning of Waste'},
+    "1.A.3.a": {"sources": ["1.A.3.a.2"], "name": "Civil Aviation"},  # aviation
+    "1.A.3.d": {"sources": ["1.A.3.d.2"], "name": "Water-borne Navigation"},  # shipping
+    "2.A": {
+        "sources": ["2.A.1", "2.A.2", "2.A.4", "2.A.5", "2.A.6"],
+        "name": "Mineral Industry",
+    },
+    "2.B": {"sources": ["2.B", "2.B.7", "2.B.9"], "name": "Chemical Industry"},
+    "2.D": {"sources": ["2.D.4"], "name": "Other"},
+    "2.E": {"sources": ["2.E_1", "2.E_2"], "name": "Electronics Industry"},
+    "2.F": {
+        "sources": ["2.F.1", "2.F.2", "2.F.3", "2.F.4", "2.F.5"],
+        "name": "Product uses as Substitutes for Ozone Depleting Substances",
+    },
+    "2.G": {"sources": ["2.G.1", "2.G.2"], "name": "Other Product Manufacture and Use"},
+    "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+    "3.C": {
+        "sources": ["3.C.4", "3.C.5", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land",
+    },
+    "M.3.C.AG": {
+        "sources": ["3.C.4", "3.C.5", "3.C.7"],
+        "name": "Aggregate sources and non-CO2 emissions sources on land ("
+        "Agriculture)",
+    },
+    "M.AG.ELV": {"sources": ["M.3.C.AG"], "name": "Agriculture excluding livestock"},
+    "4.C": {"sources": ["4.C.1"], "name": "Incineration and Open Burning of Waste"},
 }
 
 coords_terminologies_2006 = {
@@ -422,27 +522,35 @@ filter_remove_2006 = {
     },
     "livestock": {  # temp until double cat name problem is solved
         "category (IPCC2006_PRIMAP)": [
-            '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
-            '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
+            "4.B.1",
+            "4.B.10",
+            "4.B.2",
+            "4.B.3",
+            "4.B.4",
+            "4.B.5",
+            "4.B.6",
+            "4.B.7",
+            "4.B.8",
+            "4.B.9",
         ]
     },
-    "fmap": {
-        "category (IPCC2006_PRIMAP)": remove_cats
-    },
+    "fmap": {"category (IPCC2006_PRIMAP)": remove_cats},
     "f_bef_map": {
         "category (IPCC2006_PRIMAP)": [
-            '2.A.5', '2.A.6',  # combined to 2006.2.D.4
-            '4.D.1', '4.D.2',  # combined to 2006.3.C.4
-            '4.E', '4.F',  # 2006.M.3.C.1.AG
-            '1.A.2.f.2', '1.A.2.f.6',  # 2006.1.A.2.m
+            "2.A.5",
+            "2.A.6",  # combined to 2006.2.D.4
+            "4.D.1",
+            "4.D.2",  # combined to 2006.3.C.4
+            "4.E",
+            "4.F",  # 2006.M.3.C.1.AG
+            "1.A.2.f.2",
+            "1.A.2.f.6",  # 2006.1.A.2.m
         ]
-    }
+    },
 }
 
 filter_remove_after_agg = {
     "tempCats": {
-        "category (IPCC2006_PRIMAP)": [
-            "2.E_1", "2.E_2"
-        ],
+        "category (IPCC2006_PRIMAP)": ["2.E_1", "2.E_2"],
     },
 }

+ 125 - 76
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2021_Inventory_from_xlsx.py

@@ -1,12 +1,18 @@
-# this script reads data from Korea's 2021 national inventory which is underlying BUR4
-# Data is read from the xlsx file
+"""
+Read Korea's 2021 inventory from xlsx
+
+This script reads data from Korea's 2021 national inventory
+Data are read from the xlsx file
+
+"""
+
 
 import os
 import sys
 
 import pandas as pd
 import primap2 as pm2
-from .config_kor_bur4 import (
+from config_kor_bur4 import (
     aggregate_after_mapping,
     aggregate_before_mapping,
     cat_codes,
@@ -24,42 +30,43 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
-                   '2021-Inventory'
-    output_folder = extracted_data_path / 'non-UNFCCC' / 'Republic_of_Korea'
+    input_folder = (
+        downloaded_data_path / "non-UNFCCC" / "Republic_of_Korea" / "2021-Inventory"
+    )
+    output_folder = extracted_data_path / "non-UNFCCC" / "Republic_of_Korea"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'KOR_2021-Inventory_2021_'
+    output_filename = "KOR_2021-Inventory_2021_"
 
-    inventory_file = 'Republic_of_Korea_National_GHG_Inventory_(1990_2019).xlsx'
+    inventory_file = "Republic_of_Korea_National_GHG_Inventory_(1990_2019).xlsx"
     years_to_read = range(1990, 2019 + 1)
 
-    sheets_to_read = ['온실가스', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6']
+    sheets_to_read = ["온실가스", "CO2", "CH4", "N2O", "HFCs", "PFCs", "SF6"]
     cols_to_read = range(1, 2019 - 1990 + 3)
 
     # columns for category code and original category name
-    index_cols = ['분야·부문/연도']
+    index_cols = ["분야·부문/연도"]
 
     sheet_metadata = {
-        'entity': {
-            '온실가스': 'KYOTOGHG (SARGWP100)',
-            'CO2': 'CO2',
-            'CH4': 'CH4 (SARGWP100)',
-            'N2O': 'N2O (SARGWP100)',
-            'HFCs': 'HFCS (SARGWP100)',
-            'PFCs': 'PFCS (SARGWP100)',
-            'SF6': 'SF6 (SARGWP100)',
+        "entity": {
+            "온실가스": "KYOTOGHG (SARGWP100)",
+            "CO2": "CO2",
+            "CH4": "CH4 (SARGWP100)",
+            "N2O": "N2O (SARGWP100)",
+            "HFCs": "HFCS (SARGWP100)",
+            "PFCs": "PFCS (SARGWP100)",
+            "SF6": "SF6 (SARGWP100)",
+        },
+        "unit": {
+            "온실가스": "Gg CO2 / yr",
+            "CO2": "Gg CO2 / yr",
+            "CH4": "Gg CO2 / yr",
+            "N2O": "Gg CO2 / yr",
+            "HFCs": "Gg CO2 / yr",
+            "PFCs": "Gg CO2 / yr",
+            "SF6": "Gg CO2 / yr",
         },
-        'unit': {
-            '온실가스': 'Gg CO2 / yr',
-            'CO2': 'Gg CO2 / yr',
-            'CH4': 'Gg CO2 / yr',
-            'N2O': 'Gg CO2 / yr',
-            'HFCs': 'Gg CO2 / yr',
-            'PFCs': 'Gg CO2 / yr',
-            'SF6': 'Gg CO2 / yr',
-        }
     }
 
     # definitions for conversion to interchange format
@@ -73,7 +80,7 @@ if __name__ == "__main__":
 
     add_coords_cols = {
         "orig_cat_name": ["orig_cat_name", "category"],
-        "cat_name_translation": ["cat_name_translation", "category"]
+        "cat_name_translation": ["cat_name_translation", "category"],
     }
 
     coords_terminologies = {
@@ -99,12 +106,20 @@ if __name__ == "__main__":
         "f1": {
             "category (IPCC1996_KOR_INV)": "\\IGNORE",
         },
-        "livestock": { # temp until double cat name problem is solved
+        "livestock": {  # temp until double cat name problem is solved
             "category (IPCC1996_KOR_INV)": [
-                '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
-                '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
+                "4.B.1",
+                "4.B.10",
+                "4.B.2",
+                "4.B.3",
+                "4.B.4",
+                "4.B.5",
+                "4.B.6",
+                "4.B.7",
+                "4.B.8",
+                "4.B.9",
             ]
-        }
+        },
     }
 
     filter_keep = {}
@@ -115,7 +130,8 @@ if __name__ == "__main__":
         "contact": "mail@johannes-guetschow.de",
         "title": "Republic of Korea: National Greenhouse Gas Inventory Report 2021",
         "comment": "Read fom xlsx file by Johannes Gütschow",
-        "institution": "Republic of Korea, Ministry of Environment, Greenhouse Gas Inventory and Research Center",
+        "institution": "Republic of Korea, Ministry of Environment, "
+        "Greenhouse Gas Inventory and Research Center",
     }
 
     cols_for_space_stripping = []
@@ -135,11 +151,17 @@ if __name__ == "__main__":
 
     for sheet in sheets_to_read:
         # read current sheet (one sheet per gas)
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=sheet, skiprows=3, nrows=146, usecols=cols_to_read,
-                                   engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=sheet,
+            skiprows=3,
+            nrows=146,
+            usecols=cols_to_read,
+            engine="openpyxl",
+        )
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # add columns
@@ -153,7 +175,7 @@ if __name__ == "__main__":
 
     df_all = df_all.reset_index(drop=True)
     # rename category col because filtering produces problems with korean col names
-    df_all.rename(columns={"분야·부문/연도": "category"}, inplace=True)
+    df_all = df_all.rename(columns={"분야·부문/연도": "category"})
 
     # create copies of category col for further processing
     df_all["orig_cat_name"] = df_all["category"]
@@ -172,20 +194,22 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
-    copy_df=True, # we need the unchanged DF for the conversion step
-        )
+        copy_df=True,  # we need the unchanged DF for the conversion step
+    )
 
     filter_data(data_if, filter_remove=filter_remove)
 
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     # convert back to IF to have units in the fixed format
-    data_pm2 = data_pm2.reset_coords(["orig_cat_name", "cat_name_translation"], drop=True)
+    data_pm2 = data_pm2.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
     data_if = data_pm2.pr.to_interchange_format()
 
     # ###
@@ -193,17 +217,20 @@ if __name__ == "__main__":
     # ###
     if not output_folder.exists():
         output_folder.mkdir()
-    #pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     encoding = {var: compression for var in data_pm2.data_vars}
-    #data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )
 
     # ###
     # conversion to ipcc 2006 categories
     # ###
-
-
     data_if_2006 = pm2.pm2io.convert_wide_dataframe_if(
         df_all,
         coords_cols=coords_cols,
@@ -216,21 +243,23 @@ if __name__ == "__main__":
         copy_df=True,  # don't mess up the dataframe when testing
     )
 
-    cat_label = 'category (' + coords_terminologies_2006["category"] + ')'
+    cat_label = "category (" + coords_terminologies_2006["category"] + ")"
     # agg before mapping
 
     for cat_to_agg in aggregate_before_mapping:
-        mask = data_if_2006[cat_label].isin(aggregate_before_mapping[cat_to_agg]["sources"])
+        mask = data_if_2006[cat_label].isin(
+            aggregate_before_mapping[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]
 
         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -238,20 +267,25 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum()
-
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
 
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name",
-                              aggregate_before_mapping[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_before_mapping[cat_to_agg]["name"]
+            )
 
             df_combine = df_combine.reset_index()
 
             if cat_to_agg in aggregate_before_mapping[cat_to_agg]["sources"]:
-                filter_this_cat = {
-                    "f": {cat_label: cat_to_agg}
-                }
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
                 filter_data(data_if_2006, filter_remove=filter_this_cat)
 
             data_if_2006 = pd.concat([data_if_2006, df_combine])
@@ -268,17 +302,19 @@ if __name__ == "__main__":
     # agg after mapping
 
     for cat_to_agg in aggregate_after_mapping:
-        mask = data_if_2006[cat_label].isin(aggregate_after_mapping[cat_to_agg]["sources"])
+        mask = data_if_2006[cat_label].isin(
+            aggregate_after_mapping[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]
 
         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -286,36 +322,49 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum()
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
 
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name",
-                              aggregate_after_mapping[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_after_mapping[cat_to_agg]["name"]
+            )
 
             df_combine = df_combine.reset_index()
 
             if cat_to_agg in aggregate_after_mapping[cat_to_agg]["sources"]:
-                filter_this_cat = {
-                    "f": {cat_label: cat_to_agg}
-                }
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
                 filter_data(data_if_2006, filter_remove=filter_this_cat)
 
             data_if_2006 = pd.concat([data_if_2006, df_combine])
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
-
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
     # convert back to IF to have units in the fixed format
-    data_pm2_2006 = data_pm2_2006.reset_coords(["orig_cat_name", "cat_name_translation"],
-                                           drop=True)
+    data_pm2_2006 = data_pm2_2006.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
     # save IPCC2006 data
 
     filter_data(data_if_2006, filter_remove=filter_remove_after_agg)
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies_2006["category"]), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies_2006["category"]),
+        data_if_2006,
+    )
 
     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        encoding=encoding,
+    )

+ 140 - 82
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2022_Inventory_from_xlsx.py

@@ -1,12 +1,17 @@
-# this script reads data from Korea's 2021 national inventory which is underlying BUR4
-# Data is read from the xlsx file
+"""
+Read Korea's 2021 inventory from xlsx
+
+This script reads data from Korea's 2022 national inventory
+Data are read from the xlsx file
+
+"""
 
 import os
 import sys
 
 import pandas as pd
 import primap2 as pm2
-from .config_kor_bur4 import (
+from config_kor_bur4 import (
     aggregate_after_mapping,
     aggregate_before_mapping,
     cat_codes,
@@ -24,42 +29,43 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
-                   '2022-Inventory'
-    output_folder = extracted_data_path / 'non-UNFCCC' / 'Republic_of_Korea'
+    input_folder = (
+        downloaded_data_path / "non-UNFCCC" / "Republic_of_Korea" / "2022-Inventory"
+    )
+    output_folder = extracted_data_path / "non-UNFCCC" / "Republic_of_Korea"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'KOR_2022-Inventory_2022_'
+    output_filename = "KOR_2022-Inventory_2022_"
 
-    inventory_file = 'Republic_of_Korea_National_GHG_Inventory_(1990_2020).xlsx'
+    inventory_file = "Republic_of_Korea_National_GHG_Inventory_(1990_2020).xlsx"
     years_to_read = range(1990, 2020 + 1)
 
-    sheets_to_read = ['온실가스', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6']
+    sheets_to_read = ["온실가스", "CO2", "CH4", "N2O", "HFCs", "PFCs", "SF6"]
     cols_to_read = range(1, 2020 - 1990 + 3)
 
     # columns for category code and original category name
-    index_cols = ['분야·부문/연도']
+    index_cols = ["분야·부문/연도"]
 
     sheet_metadata = {
-        'entity': {
-            '온실가스': 'KYOTOGHG (SARGWP100)',
-            'CO2': 'CO2',
-            'CH4': 'CH4 (SARGWP100)',
-            'N2O': 'N2O (SARGWP100)',
-            'HFCs': 'HFCS (SARGWP100)',
-            'PFCs': 'PFCS (SARGWP100)',
-            'SF6': 'SF6 (SARGWP100)',
+        "entity": {
+            "온실가스": "KYOTOGHG (SARGWP100)",
+            "CO2": "CO2",
+            "CH4": "CH4 (SARGWP100)",
+            "N2O": "N2O (SARGWP100)",
+            "HFCs": "HFCS (SARGWP100)",
+            "PFCs": "PFCS (SARGWP100)",
+            "SF6": "SF6 (SARGWP100)",
+        },
+        "unit": {
+            "온실가스": "Gg CO2 / yr",
+            "CO2": "Gg CO2 / yr",
+            "CH4": "Gg CO2 / yr",
+            "N2O": "Gg CO2 / yr",
+            "HFCs": "Gg CO2 / yr",
+            "PFCs": "Gg CO2 / yr",
+            "SF6": "Gg CO2 / yr",
         },
-        'unit': {
-            '온실가스': 'Gg CO2 / yr',
-            'CO2': 'Gg CO2 / yr',
-            'CH4': 'Gg CO2 / yr',
-            'N2O': 'Gg CO2 / yr',
-            'HFCs': 'Gg CO2 / yr',
-            'PFCs': 'Gg CO2 / yr',
-            'SF6': 'Gg CO2 / yr',
-        }
     }
 
     # definitions for conversion to interchange format
@@ -73,7 +79,7 @@ if __name__ == "__main__":
 
     add_coords_cols = {
         "orig_cat_name": ["orig_cat_name", "category"],
-        "cat_name_translation": ["cat_name_translation", "category"]
+        "cat_name_translation": ["cat_name_translation", "category"],
     }
 
     coords_terminologies = {
@@ -99,12 +105,20 @@ if __name__ == "__main__":
         "f1": {
             "category (IPCC1996_KOR_INV)": "\\IGNORE",
         },
-        "livestock": { # temp until double cat name problem is solved
+        "livestock": {  # temp until double cat name problem is solved
             "category (IPCC1996_KOR_INV)": [
-                '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
-                '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
+                "4.B.1",
+                "4.B.10",
+                "4.B.2",
+                "4.B.3",
+                "4.B.4",
+                "4.B.5",
+                "4.B.6",
+                "4.B.7",
+                "4.B.8",
+                "4.B.9",
             ]
-        }
+        },
     }
 
     filter_keep = {}
@@ -115,11 +129,10 @@ if __name__ == "__main__":
         "contact": "mail@johannes-guetschow.de",
         "title": "Republic of Korea: National Greenhouse Gas Inventory Report 2022",
         "comment": "Read fom xlsx file by Johannes Gütschow",
-        "institution": "Republic of Korea, Ministry of Environment, Greenhouse Gas Inventory and Research Center",
+        "institution": "Republic of Korea, Ministry of Environment, "
+        "Greenhouse Gas Inventory and Research Center",
     }
 
-
-
     cols_for_space_stripping = []
 
     compression = dict(zlib=True, complevel=9)
@@ -137,11 +150,17 @@ if __name__ == "__main__":
 
     for sheet in sheets_to_read:
         # read current sheet (one sheet per gas)
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=sheet, skiprows=3, nrows=146, usecols=cols_to_read,
-                                   engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=sheet,
+            skiprows=3,
+            nrows=146,
+            usecols=cols_to_read,
+            engine="openpyxl",
+        )
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # make sure all col headers are str
@@ -157,14 +176,12 @@ if __name__ == "__main__":
 
     df_all = df_all.reset_index(drop=True)
     # rename category col because filtering produces problems with korean col names
-    df_all.rename(columns={"분야·부문/연도": "category"}, inplace=True)
+    df_all = df_all.rename(columns={"분야·부문/연도": "category"})
 
     # create copies of category col for further processing
     df_all["orig_cat_name"] = df_all["category"]
     df_all["cat_name_translation"] = df_all["category"]
 
-
-
     # ###
     # convert to PRIMAP2 interchange format
     # ###
@@ -175,20 +192,22 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
-        copy_df=True, # we need the unchanged DF for the conversion step
-        )
+        copy_df=True,  # we need the unchanged DF for the conversion step
+    )
 
     filter_data(data_if, filter_remove=filter_remove)
 
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     # convert back to IF to have units in the fixed format
-    data_pm2 = data_pm2.reset_coords(["orig_cat_name", "cat_name_translation"], drop=True)
+    data_pm2 = data_pm2.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
     data_if = data_pm2.pr.to_interchange_format()
 
     # ###
@@ -196,17 +215,21 @@ if __name__ == "__main__":
     # ###
     if not output_folder.exists():
         output_folder.mkdir()
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     data_pm2 = pm2.pm2io.from_interchange_format(data_if)
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )
 
     # ###
     # conversion to ipcc 2006 categories
     # ###
 
-
     data_if_2006 = pm2.pm2io.convert_wide_dataframe_if(
         df_all,
         coords_cols=coords_cols,
@@ -219,21 +242,23 @@ if __name__ == "__main__":
         copy_df=True,  # don't mess up the dataframe when testing
     )
 
-    cat_label = 'category (' + coords_terminologies_2006["category"] + ')'
+    cat_label = "category (" + coords_terminologies_2006["category"] + ")"
     # agg before mapping
 
     for cat_to_agg in aggregate_before_mapping:
-        mask = data_if_2006[cat_label].isin(aggregate_before_mapping[cat_to_agg]["sources"])
+        mask = data_if_2006[cat_label].isin(
+            aggregate_before_mapping[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]
 
         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -241,20 +266,32 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum()
-
-            df_combine = df_combine.drop(columns=["category (IPCC2006_PRIMAP)", "orig_cat_name", "cat_name_translation"])
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
+
+            df_combine = df_combine.drop(
+                columns=[
+                    "category (IPCC2006_PRIMAP)",
+                    "orig_cat_name",
+                    "cat_name_translation",
+                ]
+            )
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name",
-                              aggregate_before_mapping[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_before_mapping[cat_to_agg]["name"]
+            )
 
             df_combine = df_combine.reset_index()
 
             if cat_to_agg in aggregate_before_mapping[cat_to_agg]["sources"]:
-                filter_this_cat = {
-                    "f": {cat_label: cat_to_agg}
-                }
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
                 filter_data(data_if_2006, filter_remove=filter_this_cat)
 
             data_if_2006 = pd.concat([data_if_2006, df_combine])
@@ -271,17 +308,19 @@ if __name__ == "__main__":
     # agg after mapping
 
     for cat_to_agg in aggregate_after_mapping:
-        mask = data_if_2006[cat_label].isin(aggregate_after_mapping[cat_to_agg]["sources"])
+        mask = data_if_2006[cat_label].isin(
+            aggregate_after_mapping[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]
 
         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -289,37 +328,56 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum()
-
-            df_combine = df_combine.drop(columns=["category (IPCC2006_PRIMAP)", "orig_cat_name", "cat_name_translation"])
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
+
+            df_combine = df_combine.drop(
+                columns=[
+                    "category (IPCC2006_PRIMAP)",
+                    "orig_cat_name",
+                    "cat_name_translation",
+                ]
+            )
             df_combine.insert(0, cat_label, cat_to_agg)
-            df_combine.insert(1, "orig_cat_name",
-                              aggregate_after_mapping[cat_to_agg]["name"])
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_after_mapping[cat_to_agg]["name"]
+            )
 
             df_combine = df_combine.reset_index()
 
             if cat_to_agg in aggregate_after_mapping[cat_to_agg]["sources"]:
-                filter_this_cat = {
-                    "f": {cat_label: cat_to_agg}
-                }
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
                 filter_data(data_if_2006, filter_remove=filter_this_cat)
 
             data_if_2006 = pd.concat([data_if_2006, df_combine])
         else:
             print(f"no data to aggregate category {cat_to_agg}")
 
-
-    #conversion to PRIMAP2 native format
+    # conversion to PRIMAP2 native format
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
     # convert back to IF to have units in the fixed format
-    data_pm2_2006 = data_pm2_2006.reset_coords(["orig_cat_name", "cat_name_translation"],
-                                           drop=True)
+    data_pm2_2006 = data_pm2_2006.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
     data_if_2006 = data_pm2_2006.pr.to_interchange_format()
     # save IPCC2006 data
 
     filter_data(data_if_2006, filter_remove=filter_remove_after_agg)
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies_2006["category"]), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies_2006["category"]),
+        data_if_2006,
+    )
 
     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf(output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"), encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        encoding=encoding,
+    )

+ 75 - 47
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py

@@ -1,12 +1,17 @@
-# this script reads data from Korea's BUR4
-# Data is read from the xlsx file
+"""
+Read Korea's BUR4 from xlsx
+
+This script reads data from Korea's 2020 national inventory which is underlying BUR4
+Data are read from the xlsx file
+
+"""
 
 import os
 import sys
 
 import pandas as pd
 import primap2 as pm2
-from .config_kor_bur4 import cat_codes, cat_name_translations
+from config_kor_bur4 import cat_codes, cat_name_translations
 from primap2.pm2io._data_reading import filter_data
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -15,42 +20,43 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
-                   '2020-Inventory'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Republic_of_Korea'
+    input_folder = (
+        downloaded_data_path / "non-UNFCCC" / "Republic_of_Korea" / "2020-Inventory"
+    )
+    output_folder = extracted_data_path / "UNFCCC" / "Republic_of_Korea"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'KOR_BUR4_2021_'
+    output_filename = "KOR_BUR4_2021_"
 
-    inventory_file = 'Republic_of_Korea_National_GHG_Inventory_(1990_2018).xlsx'
+    inventory_file = "Republic_of_Korea_National_GHG_Inventory_(1990_2018).xlsx"
     years_to_read = range(1990, 2018 + 1)
 
-    sheets_to_read = ['온실가스', 'CO2', 'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6']
+    sheets_to_read = ["온실가스", "CO2", "CH4", "N2O", "HFCs", "PFCs", "SF6"]
     cols_to_read = range(1, 2018 - 1990 + 3)
 
     # columns for category code and original category name
-    index_cols = ['분야·부문/연도']
+    index_cols = ["분야·부문/연도"]
 
     sheet_metadata = {
-        'entity': {
-            '온실가스': 'KYOTOGHG (SARGWP100)',
-            'CO2': 'CO2',
-            'CH4': 'CH4 (SARGWP100)',
-            'N2O': 'N2O (SARGWP100)',
-            'HFCs': 'HFCS (SARGWP100)',
-            'PFCs': 'PFCS (SARGWP100)',
-            'SF6': 'SF6 (SARGWP100)',
+        "entity": {
+            "온실가스": "KYOTOGHG (SARGWP100)",
+            "CO2": "CO2",
+            "CH4": "CH4 (SARGWP100)",
+            "N2O": "N2O (SARGWP100)",
+            "HFCs": "HFCS (SARGWP100)",
+            "PFCs": "PFCS (SARGWP100)",
+            "SF6": "SF6 (SARGWP100)",
+        },
+        "unit": {
+            "온실가스": "Gg CO2 / yr",
+            "CO2": "Gg CO2 / yr",
+            "CH4": "Gg CO2 / yr",
+            "N2O": "Gg CO2 / yr",
+            "HFCs": "Gg CO2 / yr",
+            "PFCs": "Gg CO2 / yr",
+            "SF6": "Gg CO2 / yr",
         },
-        'unit': {
-            '온실가스': 'Gg CO2 / yr',
-            'CO2': 'Gg CO2 / yr',
-            'CH4': 'Gg CO2 / yr',
-            'N2O': 'Gg CO2 / yr',
-            'HFCs': 'Gg CO2 / yr',
-            'PFCs': 'Gg CO2 / yr',
-            'SF6': 'Gg CO2 / yr',
-        }
     }
 
     # definitions for conversion to interchange format
@@ -64,7 +70,7 @@ if __name__ == "__main__":
 
     add_coords_cols = {
         "orig_cat_name": ["orig_cat_name", "category"],
-        "cat_name_translation": ["cat_name_translation", "category"]
+        "cat_name_translation": ["cat_name_translation", "category"],
     }
 
     coords_terminologies = {
@@ -90,21 +96,32 @@ if __name__ == "__main__":
         "f1": {
             "category (IPCC1996_KOR_INV)": "\\IGNORE",
         },
-        "livestock": { # temp until double cat name problem is solved
-            "category (IPCC1996_KOR_INV)": {
-                '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
-                '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
-            }
-        }
+        "livestock": {  # temp until double cat name problem is solved
+            "category (IPCC1996_KOR_INV)": [
+                "4.B.1",
+                "4.B.10",
+                "4.B.2",
+                "4.B.3",
+                "4.B.4",
+                "4.B.5",
+                "4.B.6",
+                "4.B.7",
+                "4.B.8",
+                "4.B.9",
+            ]
+        },
     }
 
     filter_keep = {}
 
     meta_data = {
-        "references": "https://unfccc.int/documents/418616, http://www.gir.go.kr/home/file/readDownloadFile.do?fileId=4856&fileSeq=2",
+        "references": "https://unfccc.int/documents/418616, "
+        "http://www.gir.go.kr/home/file/readDownloadFile.do?"
+        "fileId=4856&fileSeq=2",
         "rights": "",
         "contact": "mail@johannes-guetschow.de.de",
-        "title": "Republic of Korea: BUR4 / National Greenhouse Gas Inventory Report 2020",
+        "title": "Republic of Korea: BUR4 / National Greenhouse Gas Inventory Report "
+        "2020",
         "comment": "Read fom xlsx file by Johannes Gütschow",
         "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
     }
@@ -126,11 +143,17 @@ if __name__ == "__main__":
 
     for sheet in sheets_to_read:
         # read current sheet (one sheet per gas)
-        df_current = pd.read_excel(input_folder / inventory_file, sheet_name=sheet, skiprows=3, nrows=144, usecols=cols_to_read,
-                                   engine="openpyxl")
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=sheet,
+            skiprows=3,
+            nrows=144,
+            usecols=cols_to_read,
+            engine="openpyxl",
+        )
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
-        df_current.dropna(axis=0, how='all', subset=index_cols, inplace=True)
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
         # set index. necessary for the stack operation in the conversion to long format
         # df_current = df_current.set_index(index_cols)
         # add columns
@@ -144,7 +167,7 @@ if __name__ == "__main__":
 
     df_all = df_all.reset_index(drop=True)
     # rename category col because filtering produces problems with korean col names
-    df_all.rename(columns={"분야·부문/연도": "category"}, inplace=True)
+    df_all = df_all.rename(columns={"분야·부문/연도": "category"})
 
     # create copies of category col for further processing
     df_all["orig_cat_name"] = df_all["category"]
@@ -163,12 +186,12 @@ if __name__ == "__main__":
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
-        convert_str=True
-        )
+        convert_str=True,
+    )
 
     filter_data(data_if, filter_remove=filter_remove)
 
@@ -181,7 +204,12 @@ if __name__ == "__main__":
     # ###
     if not output_folder.exists():
         output_folder.mkdir()
-    pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"), encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Singapore/__init__.py

@@ -0,0 +1,30 @@
+"""Read Singapore's BURs, NIRs, NCs
+
+Scripts and configurations to read Argentina's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (red using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'SGP'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=SGP
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 408 - 256
src/unfccc_ghg_data/unfccc_reader/Singapore/config_sgp_bur5.py

@@ -1,152 +1,222 @@
+"""Config for Singapore's BUR5
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
 table_def_templates = {
-    '66_1': {  # 66
-        "area": ['68,743,522,157'],
-        "cols": ['224,280,319,359,399,445,481'],
+    "66_1": {  # 66
+        "area": ["68,743,522,157"],
+        "cols": ["224,280,319,359,399,445,481"],
         "rows_to_fix": {
             # 2: ['and Sink Categories',],
-            3: ['1A2 Manufacturing Industries',
-                '1B3 Other Emissions from', '1C - Carbon Dioxide Transport',
-                '2 — INDUSTRIAL PROCESSES AND', '2D - Non-Energy Products from',
-                '2F - Product Uses as Substitutes for',
-                '2G - Other Product Manufacture'],
+            3: [
+                "1A2 Manufacturing Industries",
+                "1B3 Other Emissions from",
+                "1C - Carbon Dioxide Transport",
+                "2 — INDUSTRIAL PROCESSES AND",
+                "2D - Non-Energy Products from",
+                "2F - Product Uses as Substitutes for",
+                "2G - Other Product Manufacture",
+            ],
         },
     },
-    '66_2': {  # 66
-        "area": ['671,744,1117,265'],
-        "cols": ['824,875,912,954,996,1040,1082'],
+    "66_2": {  # 66
+        "area": ["671,744,1117,265"],
+        "cols": ["824,875,912,954,996,1040,1082"],
         "rows_to_fix": {
-            3: ['3 — AGRICULTURE, FORESTRY AND', '3C - Aggregate Sources and Non-CO2',
-                '4C - Incineration and Open Burning',
-                '4D -  Wastewater Treatment',
-                '5A - Indirect N2O emissions from the', 'CO2 from Biomass Combustion',
-                ],
+            3: [
+                "3 — AGRICULTURE, FORESTRY AND",
+                "3C - Aggregate Sources and Non-CO2",
+                "4C - Incineration and Open Burning",
+                "4D -  Wastewater Treatment",
+                "5A - Indirect N2O emissions from the",
+                "CO2 from Biomass Combustion",
+            ],
         },
     },
-    '67_1': {  # 67
-        "area": ['70,727,554,159'],
-        "cols": ['207,254,291,319,356,400,442,468,503'],
+    "67_1": {  # 67
+        "area": ["70,727,554,159"],
+        "cols": ["207,254,291,319,356,400,442,468,503"],
         "rows_to_fix": {
-            2: ['2 — INDUSTRIAL PROCESSES', '2A4 Other Process Uses',
-                '2B4 Caprolactam, Glyoxal and', '2B8 Petrochemical and',
-                ],
-            3: ['Total National Emissions',
-                ],
+            2: [
+                "2 — INDUSTRIAL PROCESSES",
+                "2A4 Other Process Uses",
+                "2B4 Caprolactam, Glyoxal and",
+                "2B8 Petrochemical and",
+            ],
+            3: [
+                "Total National Emissions",
+            ],
         },
     },
-    '67_2': {  # 67
-        "area": ['666,725,1150,119'],
-        "cols": ['801,847,889,915,952,996,1036,1063,1098'],
+    "67_2": {  # 67
+        "area": ["666,725,1150,119"],
+        "cols": ["801,847,889,915,952,996,1036,1063,1098"],
         "rows_to_fix": {
-            2: ['2D - Non-Energy Products from', '2G - Other Product',
-                '2G2 SF6 and PFCs from', '2H2 Food and Beverages',
-                ],
-            3: ['Total National Emissions', '2E1 Integrated Circuit',
-                '2F - Product Uses as Substitutes for', '2F1 Refrigeration and',
-                ],
+            2: [
+                "2D - Non-Energy Products from",
+                "2G - Other Product",
+                "2G2 SF6 and PFCs from",
+                "2H2 Food and Beverages",
+            ],
+            3: [
+                "Total National Emissions",
+                "2E1 Integrated Circuit",
+                "2F - Product Uses as Substitutes for",
+                "2F1 Refrigeration and",
+            ],
         },
     },
-    '68_1': {  # 68
-        "area": ['66,787,524,217'],
-        "cols": ['205,261,315,366,415,473'],
+    "68_1": {  # 68
+        "area": ["66,787,524,217"],
+        "cols": ["205,261,315,366,415,473"],
         "rows_to_fix": {
-            2: ['2 — INDUSTRIAL PROCESSES', '2A4 Other Process Uses',
-                '2B4 Caprolactam, Glyoxal and', '2B8 Petrochemical and',
-                ],
-            3: ['Total National Emissions',
-                ],
+            2: [
+                "2 — INDUSTRIAL PROCESSES",
+                "2A4 Other Process Uses",
+                "2B4 Caprolactam, Glyoxal and",
+                "2B8 Petrochemical and",
+            ],
+            3: [
+                "Total National Emissions",
+            ],
         },
     },
-    '68_2': {  # 68
-        "area": ['666,787,1119,180'],
-        "cols": ['808,854,910,961,1017,1066'],
+    "68_2": {  # 68
+        "area": ["666,787,1119,180"],
+        "cols": ["808,854,910,961,1017,1066"],
         "rows_to_fix": {
-            2: ['2D - Non-Energy Products from',
-                '2F - Product Uses as Substitutes for', '2F1 Refrigeration and Air',
-                '2G2 SF6 and PFCs from Other', '2H2 Food and Beverages',
-                ],
-            3: ['Total National Emissions', '2E1 Integrated Circuit or',
-                '2G - Other Product Manufacture',
-                ],
+            2: [
+                "2D - Non-Energy Products from",
+                "2F - Product Uses as Substitutes for",
+                "2F1 Refrigeration and Air",
+                "2G2 SF6 and PFCs from Other",
+                "2H2 Food and Beverages",
+            ],
+            3: [
+                "Total National Emissions",
+                "2E1 Integrated Circuit or",
+                "2G - Other Product Manufacture",
+            ],
         },
     },
-    '84_1': {  # 84
-        "area": ['70,667,525,112'],
-        "cols": ['193,291,345,396,440,480'],
+    "84_1": {  # 84
+        "area": ["70,667,525,112"],
+        "cols": ["193,291,345,396,440,480"],
         "rows_to_fix": {},
     },
-    '84_2': {  # 84
-        "area": ['668,667,1115,83'],
-        "cols": ['854,908,954,1001,1038,1073'],
-        "rows_to_fix": { },
+    "84_2": {  # 84
+        "area": ["668,667,1115,83"],
+        "cols": ["854,908,954,1001,1038,1073"],
+        "rows_to_fix": {},
     },
-    '85_1': {  # 85
-        "area": ['70,680,531,170'],
-        "cols": ['275,328,375,414,456,489'],
+    "85_1": {  # 85
+        "area": ["70,680,531,170"],
+        "cols": ["275,328,375,414,456,489"],
         "rows_to_fix": {},
     },
-    '85_2': {  # 85
-        "area": ['663,675,1117,175'],
-        "cols": ['849,908,954,1001,1045,1073'],
+    "85_2": {  # 85
+        "area": ["663,675,1117,175"],
+        "cols": ["849,908,954,1001,1045,1073"],
         "rows_to_fix": {
-            3: ['3C — Aggregate Sources and Non-CO2',
-                '3C4 - Direct N2O Emissions from', '3C5 - Indirect N2O Emissions from',
-                '3C6 - Indirect N2O Emissions from']
+            3: [
+                "3C — Aggregate Sources and Non-CO2",
+                "3C4 - Direct N2O Emissions from",
+                "3C5 - Indirect N2O Emissions from",
+                "3C6 - Indirect N2O Emissions from",
+            ]
         },
     },
-    '92': {  # 92
-        "area": ['72,672,514,333'],
-        "cols": ['228,275,319,361,398,438,489'],
+    "92": {  # 92
+        "area": ["72,672,514,333"],
+        "cols": ["228,275,319,361,398,438,489"],
         "rows_to_fix": {
-            3: ['4A1 Managed Waste',
-                '4A2 Unmanaged Waste', '4A3 Uncategorised Waste',
-                '4C - Incineration and', '4D - Wastewater Treatment',
-                '4D1 Domestic Wastewater', '4D2 Industrial Wastewater']
+            3: [
+                "4A1 Managed Waste",
+                "4A2 Unmanaged Waste",
+                "4A3 Uncategorised Waste",
+                "4C - Incineration and",
+                "4D - Wastewater Treatment",
+                "4D1 Domestic Wastewater",
+                "4D2 Industrial Wastewater",
+            ]
         },
     },
-    '95_1': {  # 95
-        "area": ['70,731,507,149'],
-        "cols": ['233,307,375,452'],
+    "95_1": {  # 95
+        "area": ["70,731,507,149"],
+        "cols": ["233,307,375,452"],
         "drop_rows": [0, 1, 2, 3],
         "rows_to_fix": {
-            3: ['Total (Net)', '1A2 Manufacturing Industries',
-                '2 — INDUSTRIAL PROCESSES', '3 — AGRICULTURE, FORESTRY',
-                '3C - Aggregate Sources and Non-CO2', '4C - Incineration and Open',
-                'Clinical Waste', '4D - Wastewater Treatment',
-                'CO2 from Biomass Combustion for']
+            3: [
+                "Total (Net)",
+                "1A2 Manufacturing Industries",
+                "2 — INDUSTRIAL PROCESSES",
+                "3 — AGRICULTURE, FORESTRY",
+                "3C - Aggregate Sources and Non-CO2",
+                "4C - Incineration and Open",
+                "Clinical Waste",
+                "4D - Wastewater Treatment",
+                "CO2 from Biomass Combustion for",
+            ]
         },
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories',
-                       'Net CO2', 'CH4', 'N2O', 'HFCs'],
-            'unit': ['', 'Gg', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "Net CO2",
+                "CH4",
+                "N2O",
+                "HFCs",
+            ],
+            "unit": ["", "Gg", "GgCO2eq", "GgCO2eq", "GgCO2eq"],
         },
     },
-    '95_2': {  # 95
-        "area": ['666,731,1103,149'],
-        "cols": ['829,903,971,1048'],
+    "95_2": {  # 95
+        "area": ["666,731,1103,149"],
+        "cols": ["829,903,971,1048"],
         "drop_rows": [0, 1, 2, 3, 4, 5],
         "rows_to_fix": {
-            3: ['Total (Net)', '1A2 Manufacturing Industries',
-                '2 — INDUSTRIAL PROCESSES', '3 — AGRICULTURE, FORESTRY',
-                '3C - Aggregate Sources and Non-CO2', '4C - Incineration and Open',
-                'Clinical Waste', '4D - Wastewater Treatment',
-                'CO2 from Biomass Combustion for']
+            3: [
+                "Total (Net)",
+                "1A2 Manufacturing Industries",
+                "2 — INDUSTRIAL PROCESSES",
+                "3 — AGRICULTURE, FORESTRY",
+                "3C - Aggregate Sources and Non-CO2",
+                "4C - Incineration and Open",
+                "Clinical Waste",
+                "4D - Wastewater Treatment",
+                "CO2 from Biomass Combustion for",
+            ]
         },
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories',
-                       'PFCs', 'SF6', 'NF3', 'Total (Net) National Emissions'],
-            'unit': ['', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "PFCs",
+                "SF6",
+                "NF3",
+                "Total (Net) National Emissions",
+            ],
+            "unit": ["", "GgCO2eq", "GgCO2eq", "GgCO2eq", "GgCO2eq"],
         },
     },
 }
 
 table_defs = {
-    '66': {
-        "templates": ['66_1', '66_2'],
+    "66": {
+        "templates": ["66_1", "66_2"],
         # "header_rows": [0, 1],
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories', 'Net CO2',
-                       'CH4', 'N2O', 'HFCs', 'PFCs', 'SF6', 'NF3'],
-            'unit': ['', 'Gg', 'Gg', 'Gg', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq', 'GgCO2eq'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "Net CO2",
+                "CH4",
+                "N2O",
+                "HFCs",
+                "PFCs",
+                "SF6",
+                "NF3",
+            ],
+            "unit": ["", "Gg", "Gg", "Gg", "GgCO2eq", "GgCO2eq", "GgCO2eq", "GgCO2eq"],
         },
         "drop_rows": [0, 1, 2, 3],
         # "drop_cols": ['NF3', 'SF6'],
@@ -155,13 +225,22 @@ table_defs = {
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018",
     },
-    '67': {
-        "templates": ['67_1', '67_2'],
+    "67": {
+        "templates": ["67_1", "67_2"],
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories', 'HFC-23', 'HFC-32',
-                       'HFC-41', 'HFC-125', 'HFC-134a', 'HFC-143a', 'HFC-152a',
-                       'HFC-227ea', 'HFC-43-10mee'],
-            'unit': ['', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "HFC-23",
+                "HFC-32",
+                "HFC-41",
+                "HFC-125",
+                "HFC-134a",
+                "HFC-143a",
+                "HFC-152a",
+                "HFC-227ea",
+                "HFC-43-10mee",
+            ],
+            "unit": ["", "kg", "kg", "kg", "kg", "kg", "kg", "kg", "kg", "kg"],
         },
         "drop_rows": [0, 1, 2, 3],
         # "drop_cols": ['NF3', 'SF6'],
@@ -170,24 +249,31 @@ table_defs = {
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018_fgases",
     },
-    '68': {
-        "templates": ['68_1', '68_2'],
+    "68": {
+        "templates": ["68_1", "68_2"],
         "header": {
-            'entity': ['Greenhouse Gas Source and Sink Categories', 'PFC-14',
-                       'PFC-116', 'PFC-218', 'PFC-318', 'SF6', 'NF3'],
-            'unit': ['', 'kg', 'kg', 'kg', 'kg', 'kg', 'kg'],
+            "entity": [
+                "Greenhouse Gas Source and Sink Categories",
+                "PFC-14",
+                "PFC-116",
+                "PFC-218",
+                "PFC-318",
+                "SF6",
+                "NF3",
+            ],
+            "unit": ["", "kg", "kg", "kg", "kg", "kg", "kg"],
         },
         "drop_rows": [0, 1, 2],
-         "category_col": "Greenhouse Gas Source and Sink Categories",
+        "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2018,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018_fgases",
     },
-    '84': {
-        "templates": ['84_1', '84_2'],
+    "84": {
+        "templates": ["84_1", "84_2"],
         "header": {
-            'entity': ['Categories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'NMVOC'],
-            'unit': ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg'],
+            "entity": ["Categories", "CO2", "CH4", "N2O", "NOx", "CO", "NMVOC"],
+            "unit": ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
         },
         "drop_rows": [0, 1, 2, 3, 4, 5],
         "category_col": "Categories",
@@ -195,11 +281,11 @@ table_defs = {
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018",
     },
-    '85': {
-        "templates": ['85_1', '85_2'],
+    "85": {
+        "templates": ["85_1", "85_2"],
         "header": {
-            'entity': ['Categories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'NMVOC'],
-            'unit': ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg'],
+            "entity": ["Categories", "CO2", "CH4", "N2O", "NOx", "CO", "NMVOC"],
+            "unit": ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
         },
         "drop_rows": [0, 1, 2, 3, 4, 5],
         "category_col": "Categories",
@@ -207,11 +293,11 @@ table_defs = {
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018",
     },
-    '92': {
-        "templates": ['92'],
+    "92": {
+        "templates": ["92"],
         "header": {
-            'entity': ['Categories', 'CO2', 'CH4', 'N2O', 'NOx', 'CO', 'NMVOC', 'SO2'],
-            'unit': ['', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg', 'Gg'],
+            "entity": ["Categories", "CO2", "CH4", "N2O", "NOx", "CO", "NMVOC", "SO2"],
+            "unit": ["", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg", "Gg"],
         },
         "drop_rows": [0, 1, 2],
         "category_col": "Categories",
@@ -219,43 +305,43 @@ table_defs = {
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "2018",
     },
-    '95': {
-        "templates": ['95_1', '95_2'],
+    "95": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2016,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
     },
-    '96': {
-        "templates": ['95_1', '95_2'],
+    "96": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2014,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
     },
-    '97': {
-        "templates": ['95_1', '95_2'],
+    "97": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2012,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
     },
-    '98': {
-        "templates": ['95_1', '95_2'],
+    "98": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2010,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
     },
-    '99': {
-        "templates": ['95_1', '95_2'],
+    "99": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 2000,
         # "unit_info": unit_info_2018,
         "coords_value_mapping": "other",
     },
-    '100': {
-        "templates": ['95_1', '95_2'],
+    "100": {
+        "templates": ["95_1", "95_2"],
         "category_col": "Greenhouse Gas Source and Sink Categories",
         "year": 1994,
         # "unit_info": unit_info_2018,
@@ -264,12 +350,12 @@ table_defs = {
 }
 
 cat_names_fix = {
-    '14Ab Residential': '1A4b Residential',
+    "14Ab Residential": "1A4b Residential",
 }
 
 values_replacement = {
-#    '': '-',
-    ' ': '',
+    #    '': '-',
+    " ": "",
 }
 
 gwp_to_use = "AR5GWP100"
@@ -281,28 +367,28 @@ unit_row = "header"
 
 ## parameters part 2: conversion to PRIMAP2 interchnage format
 
-cats_remove = ['Information items']
+cats_remove = ["Information items"]
 
 cat_codes_manual = {
-    'CO2 from Biomass Combustion for Energy Production': 'M.BIO',
-    'Total National Emissions and Removals': '0',
-    'Total (Net) National Emissions': '0',
-    'Clinical Waste Incineration': 'M.4.C.1',
-    'Hazardous Waste Incineration': 'M.4.C.2',
+    "CO2 from Biomass Combustion for Energy Production": "M.BIO",
+    "Total National Emissions and Removals": "0",
+    "Total (Net) National Emissions": "0",
+    "Clinical Waste Incineration": "M.4.C.1",
+    "Hazardous Waste Incineration": "M.4.C.2",
     #'3 AGRICULTURE': 'M.AG',
-    '3 AGRICULTURE, FORESTRY AND OTHER LAND USE': '3',
+    "3 AGRICULTURE, FORESTRY AND OTHER LAND USE": "3",
     #'3 LAND USE, LAND-USE CHANGE AND FORESTRY': 'M.LULUCF',
 }
 
 
-cat_code_regexp = r'(?P<code>^[A-Za-z0-9]{1,7})\s.*'
+cat_code_regexp = r"(?P<code>^[A-Za-z0-9]{1,7})\s.*"
 
 # special header as category code and name in one column
 header_long = ["orig_cat_name", "entity", "unit", "time", "data"]
 
 coords_terminologies = {
     "area": "ISO3",
-    "category": "IPCC2006_PRIMAP", #two extra categories
+    "category": "IPCC2006_PRIMAP",  # two extra categories
     "scenario": "PRIMAP",
 }
 
@@ -310,63 +396,59 @@ coords_defaults = {
     "source": "SGP-GHG-inventory ",
     "provenance": "measured",
     "area": "SGP",
-    "scenario": "BUR5"
+    "scenario": "BUR5",
 }
 
 coords_value_mapping = {
     "2018": {
         "unit": "PRIMAP1",
         "entity": {
-            'HFCs': f'HFCS ({gwp_to_use})',
-            'PFCs': f'PFCS ({gwp_to_use})',
-            'CH4': 'CH4',
-            'N2O': 'N2O',
-            'NF3': f'NF3 ({gwp_to_use})',
-            'Net CO2': 'CO2',
-            'SF6': f'SF6 ({gwp_to_use})',
-            'Total (Net) National Emissions': 'KYOTOGHG (AR5GWP100)',
+            "HFCs": f"HFCS ({gwp_to_use})",
+            "PFCs": f"PFCS ({gwp_to_use})",
+            "CH4": "CH4",
+            "N2O": "N2O",
+            "NF3": f"NF3 ({gwp_to_use})",
+            "Net CO2": "CO2",
+            "SF6": f"SF6 ({gwp_to_use})",
+            "Total (Net) National Emissions": "KYOTOGHG (AR5GWP100)",
         },
     },
     "2018_fgases": {
         "unit": "PRIMAP1",
         "entity": {
-            'HFC-125': 'HFC125',
-            'HFC-134a': 'HFC134a',
-            'HFC-143a': 'HFC143a',
-            'HFC-152a': 'HFC152a',
-            'HFC-227ea': 'HFC227ea',
-            'HFC-23': 'HFC23',
-            'HFC-32': 'HFC32',
-            'HFC-41': 'HFC41',
-            'HFC-43-10mee': 'HFC4310mee',
-            'NF3': 'NF3',
-            'PFC-116': 'C2F6',
-            'PFC-14': 'CF4',
-            'PFC-218': 'C3F8',
-            'PFC-318': 'cC4F8',
-            'SF6': 'SF6',
+            "HFC-125": "HFC125",
+            "HFC-134a": "HFC134a",
+            "HFC-143a": "HFC143a",
+            "HFC-152a": "HFC152a",
+            "HFC-227ea": "HFC227ea",
+            "HFC-23": "HFC23",
+            "HFC-32": "HFC32",
+            "HFC-41": "HFC41",
+            "HFC-43-10mee": "HFC4310mee",
+            "NF3": "NF3",
+            "PFC-116": "C2F6",
+            "PFC-14": "CF4",
+            "PFC-218": "C3F8",
+            "PFC-318": "cC4F8",
+            "SF6": "SF6",
         },
     },
     "other": {
         "unit": "PRIMAP1",
         "entity": {
-            'HFCs': f'HFCS ({gwp_to_use})',
-            'CH4': f'CH4 ({gwp_to_use})',
-            'N2O': f'N2O ({gwp_to_use})',
-            'NF3': f'NF3 ({gwp_to_use})',
-            'Net CO2': 'CO2',
-            'PFCs': f'PFCS ({gwp_to_use})',
-            'SF6': f'SF6 ({gwp_to_use})',
-            'Total (Net) National Emissions': f'KYOTOGHG ({gwp_to_use})',
+            "HFCs": f"HFCS ({gwp_to_use})",
+            "CH4": f"CH4 ({gwp_to_use})",
+            "N2O": f"N2O ({gwp_to_use})",
+            "NF3": f"NF3 ({gwp_to_use})",
+            "Net CO2": "CO2",
+            "PFCs": f"PFCS ({gwp_to_use})",
+            "SF6": f"SF6 ({gwp_to_use})",
+            "Total (Net) National Emissions": f"KYOTOGHG ({gwp_to_use})",
         },
     },
 }
 
-coords_cols = {
-    "category": "category",
-    "entity": "entity",
-    "unit": "unit"
-}
+coords_cols = {"category": "category", "entity": "entity", "unit": "unit"}
 
 add_coords_cols = {
     "orig_cat_name": ["orig_cat_name", "category"],
@@ -386,7 +468,7 @@ meta_data = {
     "rights": "",
     "contact": "mail@johannes-guetschow.de",
     "title": "Singapore's Fifth National Communication and Fifth Biannial Update "
-             "Report",
+    "Report",
     "comment": "Read fom pdf file by Johannes Gütschow",
     "institution": "United Nations Framework Convention on Climate Change (UNFCCC)",
 }
@@ -394,92 +476,165 @@ meta_data = {
 
 ## processing
 aggregate_sectors = {
-    '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
-          'name': 'IPPU'},
-    'M.3.C.1.AG': {'sources': ['3.C.1.b', '3.C.1.c'], 'name': 'Emissions from Biomass Burning (Agriculture)'},
-    'M.3.C.1.LU': {'sources': ['3.C.1.a', '3.C.1.d'], 'name': 'Emissions from Biomass Burning (LULUCF)'},
-    'M.3.C.AG': {'sources': ['M.3.C.1.AG', '3.C.2', '3.C.3', '3.C.4', '3.C.5',
-                             '3.C.6', '3.C.7', '3.C.8'],
-                 'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-    'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock emissions'},
-    'M.AG': {'sources': ['M.AG.ELV', '3.A'], 'name': 'Agriculture'},
-    'M.LULUCF': {'sources': ['M.3.C.1.LU', '3.B', '3.D'],
-                 'name': 'Land Use, Land Use Change, and Forestry'},
-    'M.0.EL': {'sources': ['1', '2', 'M.AG', '4', '5'], 'name': 'National Total Excluding LULUCF'},
-    '0': {'sources': ['1', '2', '3', '4', '5'], 'name': 'National Total'},
+    "2": {
+        "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"],
+        "name": "IPPU",
+    },
+    "M.3.C.1.AG": {
+        "sources": ["3.C.1.b", "3.C.1.c"],
+        "name": "Emissions from Biomass Burning (Agriculture)",
+    },
+    "M.3.C.1.LU": {
+        "sources": ["3.C.1.a", "3.C.1.d"],
+        "name": "Emissions from Biomass Burning (LULUCF)",
+    },
+    "M.3.C.AG": {
+        "sources": [
+            "M.3.C.1.AG",
+            "3.C.2",
+            "3.C.3",
+            "3.C.4",
+            "3.C.5",
+            "3.C.6",
+            "3.C.7",
+            "3.C.8",
+        ],
+        "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+    },
+    "M.AG.ELV": {
+        "sources": ["M.3.C.AG"],
+        "name": "Agriculture excluding livestock emissions",
+    },
+    "M.AG": {"sources": ["M.AG.ELV", "3.A"], "name": "Agriculture"},
+    "M.LULUCF": {
+        "sources": ["M.3.C.1.LU", "3.B", "3.D"],
+        "name": "Land Use, Land Use Change, and Forestry",
+    },
+    "M.0.EL": {
+        "sources": ["1", "2", "M.AG", "4", "5"],
+        "name": "National Total Excluding LULUCF",
+    },
+    "0": {"sources": ["1", "2", "3", "4", "5"], "name": "National Total"},
 }
 
 
 processing_info_step1 = {
     # aggregate IPPU which is missing for individual fgases so it can be used in the
     # next step (downscaling)
-    'aggregate_cats': {
-        '2': {'sources': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
-              'name': 'IPPU'},
+    "aggregate_cats": {
+        "2": {
+            "sources": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"],
+            "name": "IPPU",
+        },
     },
-    'tolerance': 1, # because ch4 is inconsistent
+    "tolerance": 1,  # because ch4 is inconsistent
 }
 
-processing_info_step2 =  {
-    'aggregate_cats': aggregate_sectors,
-    'downscale': {
-        'sectors': {
-            'IPPU': {
-                'basket': '2',
-                'basket_contents': ['2.A', '2.B', '2.C', '2.D', '2.E',
-                                    '2.F', '2.G', '2.H'],
-                'entities': ['CO2', 'N2O', f'PFCS ({gwp_to_use})',
-                             f'HFCS ({gwp_to_use})', 'SF6', 'NF3'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+processing_info_step2 = {
+    "aggregate_cats": aggregate_sectors,
+    "downscale": {
+        "sectors": {
+            "IPPU": {
+                "basket": "2",
+                "basket_contents": [
+                    "2.A",
+                    "2.B",
+                    "2.C",
+                    "2.D",
+                    "2.E",
+                    "2.F",
+                    "2.G",
+                    "2.H",
+                ],
+                "entities": [
+                    "CO2",
+                    "N2O",
+                    f"PFCS ({gwp_to_use})",
+                    f"HFCS ({gwp_to_use})",
+                    "SF6",
+                    "NF3",
+                ],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
             # AFOLU downscaling. Most is zero anyway
-            '3C': {
-                'basket': '3.C',
-                'basket_contents': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5',
-                                    '3.C.6', '3.C.7', '3.C.8'],
-                'entities': ['CO2', 'CH4', 'N2O'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "3C": {
+                "basket": "3.C",
+                "basket_contents": [
+                    "3.C.1",
+                    "3.C.2",
+                    "3.C.3",
+                    "3.C.4",
+                    "3.C.5",
+                    "3.C.6",
+                    "3.C.7",
+                    "3.C.8",
+                ],
+                "entities": ["CO2", "CH4", "N2O"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '3C1': {
-                'basket': '3.C.1',
-                'basket_contents': ['3.C.1.a', '3.C.1.b', '3.C.1.c', '3.C.1.d'],
-                'entities': ['CO2', 'CH4', 'N2O'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "3C1": {
+                "basket": "3.C.1",
+                "basket_contents": ["3.C.1.a", "3.C.1.b", "3.C.1.c", "3.C.1.d"],
+                "entities": ["CO2", "CH4", "N2O"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
-            '3D': {
-                'basket': '3.D',
-                'basket_contents': ['3.D.1', '3.D.2'],
-                'entities': ['CO2', 'CH4', 'N2O'],
-                'dim': 'category (IPCC2006_PRIMAP)',
+            "3D": {
+                "basket": "3.D",
+                "basket_contents": ["3.D.1", "3.D.2"],
+                "entities": ["CO2", "CH4", "N2O"],
+                "dim": "category (IPCC2006_PRIMAP)",
             },
         },
-        'entities': {
-            'HFCS': {
-                'basket': f'HFCS ({gwp_to_use})',
-                'basket_contents': ['HFC125', 'HFC134a', 'HFC143a', 'HFC23',
-                                    'HFC32', 'HFC4310mee', 'HFC227ea'],
-                'sel': {'category (IPCC2006_PRIMAP)':
-                            ['0', '2', '2.C', '2.E',
-                             '2.F', '2.G', '2.H']},
+        "entities": {
+            "HFCS": {
+                "basket": f"HFCS ({gwp_to_use})",
+                "basket_contents": [
+                    "HFC125",
+                    "HFC134a",
+                    "HFC143a",
+                    "HFC23",
+                    "HFC32",
+                    "HFC4310mee",
+                    "HFC227ea",
+                ],
+                "sel": {
+                    "category (IPCC2006_PRIMAP)": [
+                        "0",
+                        "2",
+                        "2.C",
+                        "2.E",
+                        "2.F",
+                        "2.G",
+                        "2.H",
+                    ]
+                },
             },
-            'PFCS': {
-                'basket': f'PFCS ({gwp_to_use})',
-                'basket_contents': ['C2F6', 'C3F8', 'CF4', 'cC4F8'],
-                'sel': {'category (IPCC2006_PRIMAP)':
-                            ['0', '2', '2.C', '2.E',
-                             '2.F', '2.G', '2.H']},
+            "PFCS": {
+                "basket": f"PFCS ({gwp_to_use})",
+                "basket_contents": ["C2F6", "C3F8", "CF4", "cC4F8"],
+                "sel": {
+                    "category (IPCC2006_PRIMAP)": [
+                        "0",
+                        "2",
+                        "2.C",
+                        "2.E",
+                        "2.F",
+                        "2.G",
+                        "2.H",
+                    ]
+                },
             },
-        }
+        },
     },
-    'remove_ts': {
-        'fgases': { # unnecessary and complicates aggregation for
+    "remove_ts": {
+        "fgases": {  # unnecessary and complicates aggregation for
             # other gases
-            'category': ['5', '5.B'],
-            'entities': [f'HFCS ({gwp_to_use})', f'PFCS ({gwp_to_use})', 'SF6', 'NF3'],
+            "category": ["5", "5.B"],
+            "entities": [f"HFCS ({gwp_to_use})", f"PFCS ({gwp_to_use})", "SF6", "NF3"],
         },
-        'CH4': { # inconsistent with IPPU sector
-            'category': ['2.A', '2.B', '2.C', '2.D', '2.E', '2.F', '2.G', '2.H'],
-            'entities': ['CH4'],
+        "CH4": {  # inconsistent with IPPU sector
+            "category": ["2.A", "2.B", "2.C", "2.D", "2.E", "2.F", "2.G", "2.H"],
+            "entities": ["CH4"],
         },
     },
     # 'basket_copy': {
@@ -488,6 +643,3 @@ processing_info_step2 =  {
     #     'source_GWP': gwp_to_use,
     # },
 }
-
-
-

+ 110 - 72
src/unfccc_ghg_data/unfccc_reader/Singapore/read_SGP_BUR5_from_pdf.py

@@ -1,12 +1,26 @@
-# read Singapore fifth BUR from pdf
+"""
+Read Singapore's BUR5 from pdf
 
+This script reads data from Singapore's BUR5
+Data are read from pdf using camelot
 
+"""
 import locale
 
-#import numpy as np
+# import numpy as np
 import camelot
 import pandas as pd
 import primap2 as pm2
+from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    fix_rows,
+    gas_baskets,
+    process_data_for_country,
+)
+
 from .config_sgp_bur5 import (
     cat_code_regexp,
     cat_codes_manual,
@@ -26,29 +40,20 @@ from .config_sgp_bur5 import (
     table_defs,
     values_replacement,
 )
-from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
-
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    fix_rows,
-    gas_baskets,
-    process_data_for_country,
-)
 
 if __name__ == "__main__":
     ### genral configuration
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Singapore' / 'BUR5'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Singapore'
+    input_folder = downloaded_data_path / "UNFCCC" / "Singapore" / "BUR5"
+    output_folder = extracted_data_path / "UNFCCC" / "Singapore"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'SGP_BUR5_2022_'
-    inventory_file_pdf = 'Singapore_-_NC5BUR5.pdf'
-    #years_to_read = range(1990, 2018 + 1)
+    output_filename = "SGP_BUR5_2022_"
+    inventory_file_pdf = "Singapore_-_NC5BUR5.pdf"
+    # years_to_read = range(1990, 2018 + 1)
 
     # define locale to use for str to float conversion
-    locale_to_use = 'en_SG.UTF-8'
+    locale_to_use = "en_SG.UTF-8"
     locale.setlocale(locale.LC_NUMERIC, locale_to_use)
 
     pagesToRead = table_defs.keys()
@@ -69,9 +74,14 @@ if __name__ == "__main__":
             print(f"Reading table {table_on_page}")
             area = table_def_templates[table_on_page]["area"]
             cols = table_def_templates[table_on_page]["cols"]
-            tables = camelot.read_pdf(str(input_folder / inventory_file_pdf),
-                                      pages=str(page), flavor='stream',
-                                      table_areas=area, columns=cols, split_text=True)
+            tables = camelot.read_pdf(
+                str(input_folder / inventory_file_pdf),
+                pages=str(page),
+                flavor="stream",
+                table_areas=area,
+                columns=cols,
+                split_text=True,
+            )
 
             df_current = tables[0].df.copy(deep=True)
             # drop the old header
@@ -79,39 +89,52 @@ if __name__ == "__main__":
                 df_current = df_current.drop(table_defs[page]["drop_rows"])
             elif "drop_rows" in table_def_templates[table_on_page].keys():
                 df_current = df_current.drop(
-                    table_def_templates[table_on_page]["drop_rows"])
+                    table_def_templates[table_on_page]["drop_rows"]
+                )
             # add new header
-            if 'header' in table_defs[page].keys():
+            if "header" in table_defs[page].keys():
                 df_current.columns = pd.MultiIndex.from_tuples(
-                    zip(table_defs[page]['header']['entity'],
-                        table_defs[page]['header']['unit']))
+                    zip(
+                        table_defs[page]["header"]["entity"],
+                        table_defs[page]["header"]["unit"],
+                    )
+                )
             else:
                 df_current.columns = pd.MultiIndex.from_tuples(
-                    zip(table_def_templates[table_on_page]['header']['entity'],
-                        table_def_templates[table_on_page]['header']['unit']))
+                    zip(
+                        table_def_templates[table_on_page]["header"]["entity"],
+                        table_def_templates[table_on_page]["header"]["unit"],
+                    )
+                )
 
             # drop cols if necessary
             if "drop_cols" in table_defs[page].keys():
-                # print(df_current.columns.values)
+                # print(df_current.columns.to_numpy())
                 df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
             elif "drop_cols" in table_def_templates[table_on_page].keys():
                 df_current = df_current.drop(columns=table_defs[page]["drop_cols"])
 
             # rename category column
-            df_current.rename(columns={table_defs[page]["category_col"]: index_cols[0]},
-                              inplace=True)
+            df_current = df_current.rename(
+                columns={table_defs[page]["category_col"]: index_cols[0]}
+            )
 
             # replace double \n
             df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("\n", " ")
             # replace double and triple spaces
-            df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("   ", " ")
+            df_current[index_cols[0]] = df_current[index_cols[0]].str.replace(
+                "   ", " "
+            )
             df_current[index_cols[0]] = df_current[index_cols[0]].str.replace("  ", " ")
 
             # fix the split rows
             for n_rows in table_def_templates[table_on_page]["rows_to_fix"].keys():
-                df_current = fix_rows(df_current,
-                                      table_def_templates[table_on_page]["rows_to_fix"][
-                                          n_rows], index_cols[0], n_rows)
+                df_current = fix_rows(
+                    df_current,
+                    table_def_templates[table_on_page]["rows_to_fix"][n_rows],
+                    index_cols[0],
+                    n_rows,
+                )
 
             # replace category names with typos
             df_current[index_cols[0]] = df_current[index_cols[0]].replace(cat_names_fix)
@@ -122,7 +145,7 @@ if __name__ == "__main__":
             # set index
             # df_current = df_current.set_index(index_cols)
             # strip trailing and leading  and remove "^"
-            for col in df_current.columns.values:
+            for col in df_current.columns.to_numpy():
                 df_current[col] = df_current[col].str.strip()
                 df_current[col] = df_current[col].str.replace("^", "")
 
@@ -132,19 +155,24 @@ if __name__ == "__main__":
                 df_this_page = df_current.copy(deep=True)
             else:
                 # find intersecting cols
-                cols_this_page = df_this_page.columns.values
+                cols_this_page = df_this_page.columns.to_numpy()
                 # print(f"cols this page: {cols_this_page}")
-                cols_current = df_current.columns.values
+                cols_current = df_current.columns.to_numpy()
                 # print(f"cols current: {cols_current}")
                 cols_both = list(set(cols_this_page).intersection(set(cols_current)))
                 # print(f"cols both: {cols_both}")
                 if len(cols_both) > 0:
-                    df_this_page = df_this_page.merge(df_current, how='outer', on=cols_both,
-                                                      suffixes=(None, None))
+                    df_this_page = df_this_page.merge(
+                        df_current, how="outer", on=cols_both, suffixes=(None, None)
+                    )
                 else:
-                    df_this_page = df_this_page.merge(df_current, how='outer',
-                                                      left_index=True, right_index=True,
-                                                      suffixes=(None, None))
+                    df_this_page = df_this_page.merge(
+                        df_current,
+                        how="outer",
+                        left_index=True,
+                        right_index=True,
+                        suffixes=(None, None),
+                    )
 
                 df_this_page = df_this_page.groupby(index_cols).first().reset_index()
                 # print(df_this_page)
@@ -152,28 +180,34 @@ if __name__ == "__main__":
 
         # set index and convert to long format
         df_this_page = df_this_page.set_index(index_cols)
-        df_this_page_long = pm2.pm2io.nir_convert_df_to_long(df_this_page,
-                                                             table_defs[page]["year"],
-                                                             header_long)
+        df_this_page_long = pm2.pm2io.nir_convert_df_to_long(
+            df_this_page, table_defs[page]["year"], header_long
+        )
 
         # drop the rows with memo items etc
         for cat in cats_remove:
             df_this_page_long = df_this_page_long.drop(
-                df_this_page_long.loc[df_this_page_long.loc[:, index_cols[0]] == cat].index)
+                df_this_page_long.loc[
+                    df_this_page_long.loc[:, index_cols[0]] == cat
+                ].index
+            )
 
         # make a copy of the categories row
         df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, index_cols[0]]
 
         # replace cat names by codes in col "Categories"
         # first the manual replacements
-        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:, "category"].replace(
-            cat_codes_manual)
+        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+            :, "category"
+        ].replace(cat_codes_manual)
+
         # then the regex repalcements
-        def repl(m):
-            return convert_ipcc_code_primap_to_primap2('IPC' + m.group('code'))
-        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[:,
-                                               "category"].str.replace(cat_code_regexp,
-                                                                       repl, regex=True)
+        def repl(m):  # noqa: D103
+            return convert_ipcc_code_primap_to_primap2("IPC" + m.group("code"))
+
+        df_this_page_long.loc[:, "category"] = df_this_page_long.loc[
+            :, "category"
+        ].str.replace(cat_code_regexp, repl, regex=True)
         df_this_page_long.loc[:, "category"].unique()
 
         # strip spaces in data col
@@ -185,27 +219,29 @@ if __name__ == "__main__":
         df_this_page_long.columns = df_this_page_long.columns.map(str)
 
         # remove thousands separators as pd.to_numeric can't deal with that
-        df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(',',
-                                                                                        '')
+        df_this_page_long.loc[:, "data"] = df_this_page_long.loc[:, "data"].str.replace(
+            ",", ""
+        )
 
         # drop orig cat name as it's not unique over all tables (keep until here in case
         # it's needed for debugging)
-        df_this_page_long = df_this_page_long.drop(columns='orig_cat_name')
+        df_this_page_long = df_this_page_long.drop(columns="orig_cat_name")
 
         data_page_if = pm2.pm2io.convert_long_dataframe_if(
             df_this_page_long,
             coords_cols=coords_cols,
-            #add_coords_cols=add_coords_cols,
+            # add_coords_cols=add_coords_cols,
             coords_defaults=coords_defaults,
             coords_terminologies=coords_terminologies,
             coords_value_mapping=coords_value_mapping[
-                table_defs[page]["coords_value_mapping"]],
+                table_defs[page]["coords_value_mapping"]
+            ],
             # coords_value_filling=coords_value_filling,
             filter_remove=filter_remove,
             # filter_keep=filter_keep,
             meta_data=meta_data,
             convert_str=True,
-            time_format='%Y',
+            time_format="%Y",
         )
 
         # conversion to PRIMAP2 native format
@@ -226,13 +262,16 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw"), data_if)
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
 
     encoding = {var: compression for var in data_pm2.data_vars}
     data_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
-
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
     #### processing
     data_proc_pm2 = data_pm2
@@ -246,22 +285,21 @@ if __name__ == "__main__":
         processing_info_country=processing_info_step1,
     )
 
-
     data_proc_pm2 = process_data_for_country(
         data_proc_pm2,
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
         processing_info_country=processing_info_step2,
-        cat_terminology_out = terminology_proc,
-        #category_conversion = None,
-        #sectors_out = None,
+        cat_terminology_out=terminology_proc,
+        # category_conversion = None,
+        # sectors_out = None,
     )
 
     # adapt source and metadata
     # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
     # ###
     # save data to IF and native format
@@ -270,10 +308,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
-
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Taiwan/__init__.py

@@ -0,0 +1,30 @@
+"""Read Taiwan's inventories
+
+Scripts and configurations to read Taiwan's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'TWN'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=TWN
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 194 - 120
src/unfccc_ghg_data/unfccc_reader/Taiwan/config_twn_nir2022.py

@@ -1,4 +1,10 @@
-# config and functions for Taiwan NIR 2022
+"""Config for Taiwan's 2022 inventory
+
+Partial configuration for camelot and data aggregation. PRIMAP2 conversion
+config and metadata are defined in the reading script
+
+"""
+
 
 from typing import Union
 
@@ -6,9 +12,36 @@ import pandas as pd
 
 gwp_to_use = "AR4GWP100"
 
-def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int)->pd.DataFrame:
+
+def fix_rows(
+    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
+) -> pd.DataFrame:
+    """
+    Combine split rows
+
+    This function combines rows which have been split into several rows during data
+    reading from pdf because they contained line breaks.
+
+    Parameters
+    ----------
+    data: pd.DataFrame
+        The data to work with
+    rows_to_fix: list
+        List of values for which to fix rows
+    col_to_use: str
+        column to use to find the rows to merge
+    n_rows: int
+        How many rows to combine for each row found. e.g. 3 means combine the found
+        row with the following two rows. Negative values are used for more
+        complicated situations where the rows to merge are also before the position
+        of the value that indicates the merge. See code for details
+
+    Returns
+    -------
+        pandas DataFrame with combined rows. The individual rows are removed
+    """
     for row in rows_to_fix:
-        #print(row)
+        # print(row)
         # find the row number and collect the row and the next two rows
         index = data.loc[data[col_to_use] == row].index
         if not list(index):
@@ -20,35 +53,35 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         for item in index:
             loc = data.index.get_loc(item)
             ####print(data[col_to_use].loc[loc + 1])
-            if n_rows == -2:
+            if n_rows == -2:  # noqa: PLR2004
                 locs_to_merge = list(range(loc - 1, loc + 1))
                 loc_to_check = loc - 1
-            #if n_rows == -3:
+            # if n_rows == -3:
             #    locs_to_merge = list(range(loc - 1, loc + 2))
-            #elif n_rows == -5:
+            # elif n_rows == -5:
             #    locs_to_merge = list(range(loc - 1, loc + 4))
             else:
                 locs_to_merge = list(range(loc, loc + n_rows))
                 loc_to_check = loc + 1
 
-            if data[col_to_use].loc[loc_to_check] == '':
+            if not data[col_to_use].loc[loc_to_check]:
                 rows_to_merge = data.iloc[locs_to_merge]
                 indices_to_merge = rows_to_merge.index
                 # replace numerical NaN values
                 ####print(rows_to_merge)
-                rows_to_merge = rows_to_merge.fillna('')
+                rows_to_merge = rows_to_merge.fillna("")
                 ####print("fillna")
                 ####print(rows_to_merge)
                 # join the three rows
-                new_row = rows_to_merge.agg(' '.join)
+                new_row = rows_to_merge.agg(" ".join)
                 # replace the double spaces that are created
                 # must be done here and not at the end as splits are not always
                 # the same and join would produce different col values
                 new_row = new_row.str.replace("  ", " ")
                 new_row = new_row.str.strip()
-                #new_row = new_row.str.replace("N O", "NO")
-                #new_row = new_row.str.replace(", N", ",N")
-                #new_row = new_row.str.replace("- ", "-")
+                # new_row = new_row.str.replace("N O", "NO")
+                # new_row = new_row.str.replace(", N", ",N")
+                # new_row = new_row.str.replace("- ", "-")
                 data.loc[indices_to_merge[0]] = new_row
                 indices_to_drop = indices_to_drop + list(indices_to_merge[1:])
 
@@ -56,12 +89,43 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
         data = data.reset_index(drop=True)
     return data
 
-def make_wide_table(data: pd.DataFrame, keyword: str, col: Union[int, str], index_cols: list[Union[int, str]])->pd.DataFrame:
+
+def make_wide_table(
+    data: pd.DataFrame,
+    keyword: str,
+    col: Union[int, str],
+    index_cols: list[Union[int, str]],
+) -> pd.DataFrame:
+    """
+    Transform a table with sections for gases to a gas-wide table
+
+    Some tables are rolled up, i.e. the header repeats within the table and the
+    tables are composed of several tables for different year ranges stacked on top of
+    each other. These tables are unrolled and converted to a proper time-wide format
+    without repetition of headers.
+
+    Parameters
+    ----------
+    data: pd.DataFrame
+        the data to convert
+    keyword: str
+        The keyword used to identify the header, e.g. 'GHG Emission Sources and Sinks'
+    col: int
+        Column to look for the keyword
+    index_cols: list[Union[int, str]]
+        Columns to use as index for the output DataFrame
+
+
+    Returns
+    -------
+        pandas DataFrame in time-wide format
+
+    """
     index = data.loc[data[col] == keyword].index
     if not list(index):
         print("Keyword for table transformation not found")
         return data
-    elif len(index)==1:
+    elif len(index) == 1:
         print("Keyword for table transformation found only once")
         return data
     else:
@@ -74,83 +138,88 @@ def make_wide_table(data: pd.DataFrame, keyword: str, col: Union[int, str], inde
                 next_loc = data.index[-1] + 1
             df_to_add = data.loc[list(range(loc, next_loc))]
             # select only cols which don't have NaN, Null, or '' as header
-            filter_nan = ((~df_to_add.iloc[0].isnull()) & (df_to_add.iloc[0] != 'NaN')& (df_to_add.iloc[0] != ''))
-            df_to_add = df_to_add.loc[: , filter_nan]
+            filter_nan = (
+                (~df_to_add.iloc[0].isna())
+                & (df_to_add.iloc[0] != "NaN")
+                & (df_to_add.iloc[0])
+            )
+            df_to_add = df_to_add.loc[:, filter_nan]
             df_to_add.columns = df_to_add.iloc[0]
-            #print(df_to_add.columns)
+            # print(df_to_add.columns)
             df_to_add = df_to_add.drop(loc)
             df_to_add = df_to_add.set_index(index_cols)
 
             if df_all is None:
                 df_all = df_to_add
             else:
-                df_all = pd.concat([df_all, df_to_add], axis=1, join='outer')
+                df_all = pd.concat([df_all, df_to_add], axis=1, join="outer")
         return df_all
 
 
 # page defs tp hold information on reading the table
 page_defs = {
-    '5': {
-        "table_areas": ['36,523,563,68'],
+    "5": {
+        "table_areas": ["36,523,563,68"],
         "split_text": False,
         "flavor": "stream",
     },
-    '6': {
-        "table_areas": ['34,562,563,53'],
-        #"columns": ['195,228,263,295,328,363,395,428,462,495,529'], # works without
+    "6": {
+        "table_areas": ["34,562,563,53"],
+        # "columns": ['195,228,263,295,328,363,395,428,462,495,529'], # works without
         "split_text": True,
         "flavor": "stream",
     },
-    '7': {
-        "table_areas": ['36,740,499,482', '36,430,564,53'],
+    "7": {
+        "table_areas": ["36,740,499,482", "36,430,564,53"],
         "split_text": True,
         "flavor": "stream",
     },
-    '8': {
-        "table_areas": ['35,748,503,567'],
+    "8": {
+        "table_areas": ["35,748,503,567"],
         "split_text": True,
         "flavor": "stream",
     },
-    '9': {
-        "table_areas": ['35,747,565,315', '36,273,565,50'],
+    "9": {
+        "table_areas": ["35,747,565,315", "36,273,565,50"],
         "split_text": False,
         "flavor": "stream",
     },
-    '11': {
-        "table_areas": ['35,744,563,434'],
+    "11": {
+        "table_areas": ["35,744,563,434"],
         "split_text": True,
         "flavor": "stream",
     },
-    '12': {
-        "table_areas": ['33,747,562,86'],
+    "12": {
+        "table_areas": ["33,747,562,86"],
         "split_text": True,
         "flavor": "stream",
     },
-    '13': {
-        "table_areas": ['34,303,564,54'],
+    "13": {
+        "table_areas": ["34,303,564,54"],
         "split_text": True,
         "flavor": "stream",
     },
-    '14': {
-        "table_areas": ['34,754,564,256'],
-        "columns": ['220,251,283,314,344,371,406,438,470,500,530'],
+    "14": {
+        "table_areas": ["34,754,564,256"],
+        "columns": ["220,251,283,314,344,371,406,438,470,500,530"],
         "split_text": True,
         "flavor": "stream",
     },
-    '15': {
-        "table_areas": ['34,487,564,42'],
+    "15": {
+        "table_areas": ["34,487,564,42"],
         "split_text": True,
         "flavor": "stream",
     },
-    '16': {
-        "table_areas": ['34,418,564,125'],
-        #"columns": ['107,209,241,273,306,338,369,402,433,466,498,533'],
+    "16": {
+        "table_areas": ["34,418,564,125"],
+        # "columns": ['107,209,241,273,306,338,369,402,433,466,498,533'],
         "split_text": True,
         "flavor": "lattice",
-    }, # with stream the row index is messed up with lattice the column index ... red with lattice and fix col header manualy
-    '17': {
-        "table_areas": ['34,534,564,49'],
-        "columns": ['188,232,263,298,331,362,398,432,464,497,530'],
+    },  # with stream the row index is messed up with lattice the column index ...
+    # read with lattice and fix col header manually
+    "17": {
+        "table_areas": ["34,534,564,49"],
+        "columns": ["188,232,263,298,331,362,398,432,464,497,530"],
         "split_text": True,
         "flavor": "stream",
     },
@@ -158,38 +227,40 @@ page_defs = {
 
 # table defs to hold information on how to process the tables
 table_defs = {
-    'ES2.2': { # 1990-2020 Carbon Dioxide Emissions and Sequestration in Taiwan
+    "ES2.2": {  # 1990-2020 Carbon Dioxide Emissions and Sequestration in Taiwan
         "tables": [1, 2],
         "rows_to_fix": {
             0: {
-                3: ['1.A.4.c Agriculture, Forestry, Fishery, and',
-                    '2.D Non-Energy Products from Fuels and',
-                    '4. Land Use, Land Use Change and Forestry'],
+                3: [
+                    "1.A.4.c Agriculture, Forestry, Fishery, and",
+                    "2.D Non-Energy Products from Fuels and",
+                    "4. Land Use, Land Use Change and Forestry",
+                ],
             },
         },
-        "index_cols": ['GHG Emission Source and Sinks'],
-        "wide_keyword": 'GHG Emission Source and Sinks',
+        "index_cols": ["GHG Emission Source and Sinks"],
+        "wide_keyword": "GHG Emission Source and Sinks",
         "col_wide_kwd": 0,
         "entity": "CO2",
         "unit": "kt",
         "cat_codes_manual": {
-            'Net GHG Emission (including LULUCF)': '0',
-            'Total GHG Emission (excluding LULUCF)': 'M.0.EL',
+            "Net GHG Emission (including LULUCF)": "0",
+            "Total GHG Emission (excluding LULUCF)": "M.0.EL",
         },
     },
-    'ES2.3': { # 1990-2020 Methane Emissions in Taiwan
+    "ES2.3": {  # 1990-2020 Methane Emissions in Taiwan
         "tables": [3, 4],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "entity": f"CH4 ({gwp_to_use})",
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total Methane Emissions': '0',
+            "Total Methane Emissions": "0",
         },
     },
-    'ES2.4': { # 1990-2020 Nitrous Oxide Emissions in Taiwan
+    "ES2.4": {  # 1990-2020 Nitrous Oxide Emissions in Taiwan
         "tables": [5],
         "fix_cats": {
             0: {
@@ -197,33 +268,33 @@ table_defs = {
             },
         },
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "entity": f"N2O ({gwp_to_use})",
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total Nitrous Oxide Emissions': '0',
+            "Total Nitrous Oxide Emissions": "0",
         },
     },
-    'ES3.1': { # 1990-2020 Greenhouse Gas Emission in Taiwan by Sector
+    "ES3.1": {  # 1990-2020 Greenhouse Gas Emission in Taiwan by Sector
         "tables": [7],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "entity": f"KYOTOGHG ({gwp_to_use})",
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Net GHG Emission (including LULUCF)': '0',
-            'Total GHG Emission (excluding LULUCF)': 'M.0.EL',
+            "Net GHG Emission (including LULUCF)": "0",
+            "Total GHG Emission (excluding LULUCF)": "M.0.EL",
         },
     },
-    'ES3.2': { # 1990-2020 Greenhouse Gas Emissions Produced by Energy Sector in Taiwan
+    "ES3.2": {  # 1990-2020 Greenhouse Gas Emissions Produced by Energy Sector in Taiwan
         "tables": [8],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "gas_splitting": {
             "Total CO2 Emission": "CO2",
@@ -234,17 +305,18 @@ table_defs = {
         },
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total CO2 Emission': '1',
-            'Total CH4 Emission': '1',
-            'Total N2O Emission': '1',
-            'Total Emission from Energy Sector': '1',
+            "Total CO2 Emission": "1",
+            "Total CH4 Emission": "1",
+            "Total N2O Emission": "1",
+            "Total Emission from Energy Sector": "1",
         },
     },
-    'ES3.3': { # 1990-2020 Greenhouse Gas Emissions Produced by Industrial Process and Product Use Sector (IPPU) in Taiwan
-        "tables": [9,10],
+    "ES3.3": {  # 1990-2020 Greenhouse Gas Emissions Produced by Industrial
+        # Process and Product Use Sector (IPPU) in Taiwan
+        "tables": [9, 10],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "gas_splitting": {
             "Total CO2 Emission": "CO2",
@@ -259,24 +331,26 @@ table_defs = {
         },
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total CO2 Emission': '2',
-            'Total CH4 Emission': '2',
-            'Total N2O Emission': '2',
-            'Total HFCs Emission': '2',
-            'Total PFCs Emission (2.E Electronics Industry)': '2.E',
-            'Total SF6 Emission': '2',
-            'Total NF3 Emission (2.E Electronics Industry)': '2.E',
-            'Total Emission from IPPU Sector': '2',
+            "Total CO2 Emission": "2",
+            "Total CH4 Emission": "2",
+            "Total N2O Emission": "2",
+            "Total HFCs Emission": "2",
+            "Total PFCs Emission (2.E Electronics Industry)": "2.E",
+            "Total SF6 Emission": "2",
+            "Total NF3 Emission (2.E Electronics Industry)": "2.E",
+            "Total Emission from IPPU Sector": "2",
         },
         "drop_rows": [
-            ("2.D Non-Energy Products from Fuels and Solvent Use", "CO2"), # has lower significant digits than in table ES2.2
-        ]
+            ("2.D Non-Energy Products from Fuels and Solvent Use", "CO2"),  # has lower
+            # significant digits than in table ES2.2
+        ],
     },
-    'ES3.4': { # 1990-2020 Greenhouse Gas Emissions Produced by Agriculture Sector in Taiwan
+    "ES3.4": {  # 1990-2020 Greenhouse Gas Emissions Produced by Agriculture Sector
+        # in Taiwan
         "tables": [11],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
         "gas_splitting": {
             "Total CO2 Emission (3.H Urea applied)": "CO2",
@@ -287,22 +361,22 @@ table_defs = {
         },
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total CO2 Emission (3.H Urea applied)': '3.H',
-            'Total CH4 Emission': '3',
-            'Total N2O Emission': '3',
-            'Total Emission From Agriculture Sector': '3',
+            "Total CO2 Emission (3.H Urea applied)": "3.H",
+            "Total CH4 Emission": "3",
+            "Total N2O Emission": "3",
+            "Total Emission From Agriculture Sector": "3",
         },
     },
-    'ES3.6': { # 1990-2020 Greenhouse Gas Emissions in Taiwan by Waste Sector
+    "ES3.6": {  # 1990-2020 Greenhouse Gas Emissions in Taiwan by Waste Sector
         "tables": [13],
         "rows_to_fix": {
             0: {
                 3: ["Total CO2 Emission"],
             },
         },
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
-        "col_wide_kwd": 0, # two column header
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
+        "col_wide_kwd": 0,  # two column header
         "gas_splitting": {
             "Total CO2 Emission (5.C Incineration and Open Burning of Waste)": "CO2",
             "Total CH4 Emission": f"CH4 ({gwp_to_use})",
@@ -312,51 +386,51 @@ table_defs = {
         },
         "unit": "ktCO2eq",
         "cat_codes_manual": {
-            'Total CO2 Emission (5.C Incineration and Open Burning of Waste)': '5.C',
-            'Total CH4 Emission': '5',
-            'Total N2O Emission': '5',
-            'Total Emission from Waste Sector': '5',
+            "Total CO2 Emission (5.C Incineration and Open Burning of Waste)": "5.C",
+            "Total CH4 Emission": "5",
+            "Total N2O Emission": "5",
+            "Total Emission from Waste Sector": "5",
         },
     },
 }
 
 table_defs_skip = {
-    'ES2.1': { # 1990-2020 Greenhouse Gas Emissions and Sequestration in Taiwan by Type
+    "ES2.1": {  # 1990-2020 Greenhouse Gas Emissions and Sequestration in Taiwan by Type
         "tables": [0],
         "rows_to_fix": {
             0: {
-                3: ['CO2'],
+                3: ["CO2"],
             },
             1: {  # wherte col 0 is empty
-                3: ['Net GHG Emission', 'Total GHG Emission'],
+                3: ["Net GHG Emission", "Total GHG Emission"],
             },
         },
-        "index_cols": ['GHG', 'GWP'],
-        "wide_keyword": 'GHG',
+        "index_cols": ["GHG", "GWP"],
+        "wide_keyword": "GHG",
         "col_wide_kwd": 0,
         "unit": "ktCO2eq",
     },
-    'ES2.5': { # 1990-2020 Fluoride-Containing Gas Emissions in Taiwan
+    "ES2.5": {  # 1990-2020 Fluoride-Containing Gas Emissions in Taiwan
         "tables": [6],
         "rows_to_fix": {
             0: {
-                -2: ['Total SF6 Emissions',
-                     'Total NF3 Emissions'],
+                -2: ["Total SF6 Emissions", "Total NF3 Emissions"],
             },
         },
-        "index_cols": ['GHG Emission Sources and Sinks'],
-        "wide_keyword": 'GHG Emission Sources and Sinks',
+        "index_cols": ["GHG Emission Sources and Sinks"],
+        "wide_keyword": "GHG Emission Sources and Sinks",
         "col_wide_kwd": 0,
-        #"entity": "CO2",
+        # "entity": "CO2",
         "unit": "ktCO2eq",
     },
-    'ES3.5': { # skip for now: 1990-2020 Changes in Carbon Sequestration by LULUCF Sector in Taiwan2],
+    "ES3.5": {  # skip for now: 1990-2020 Changes in Carbon Sequestration by LULUCF
+        # Sector in Taiwan2],
         "tables": [12],
         "rows_to_fix": {},
-        "index_cols": ['GHG Emission Sources and Sinks'], #header is merged col :-(
-        "wide_keyword": 'GHG Emission Sources and Sinks',
-        "col_wide_kwd": 0, # two column header
+        "index_cols": ["GHG Emission Sources and Sinks"],  # header is merged col :-(
+        "wide_keyword": "GHG Emission Sources and Sinks",
+        "col_wide_kwd": 0,  # two column header
         "unit": "kt",
         "entity": "CO2",
-    }, # need to consider the two columns specially (merge?)
+    },  # need to consider the two columns specially (merge?)
 }

+ 164 - 104
src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2022_Inventory_from_pdf.py

@@ -1,14 +1,24 @@
-# this script reads data from Taiwan's 2022 national inventory
-# Data is read from the english summary pdf
-# TODO: add further GWPs and gas baskets
+"""
+Read Taiwan's 2022 national inventory from pdf
+
+This script reads data from Taiwan's 2022 national inventory
+Data are read from the english summary pdf
+TODO: add further GWPs and gas baskets
+
+"""
 
 import copy
 
 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_twn_nir2022 import (fix_rows, gwp_to_use, make_wide_table, page_defs,
-                                 table_defs)
+from config_twn_nir2022 import (
+    fix_rows,
+    gwp_to_use,
+    make_wide_table,
+    page_defs,
+    table_defs,
+)
 from primap2.pm2io._data_reading import matches_time_format
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
@@ -17,16 +27,16 @@ if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'non-UNFCCC' / 'Taiwan'
+    input_folder = downloaded_data_path / "non-UNFCCC" / "Taiwan"
     # TODO: move file to subfolder
-    output_folder = extracted_data_path / 'non-UNFCCC' / 'Taiwan'
+    output_folder = extracted_data_path / "non-UNFCCC" / "Taiwan"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    output_filename = 'TWN_inventory_2022_'
-    inventory_file = '00_abstract_en.pdf'
+    output_filename = "TWN_inventory_2022_"
+    inventory_file = "00_abstract_en.pdf"
 
-    cat_code_regexp = r'(?P<code>^[a-zA-Z0-9\.]{1,7})\s.*'
+    cat_code_regexp = r"(?P<code>^[a-zA-Z0-9\.]{1,7})\s.*"
 
     time_format = "%Y"
 
@@ -79,42 +89,49 @@ if __name__ == "__main__":
     # config for part3: mapping to 2006 categpries
 
     cat_mapping = {
-        '3': 'M.AG',
-        '3.A': '3.A.1',
-        '3.B': '3.A.2',
-        '3.C': '3.C.7',
-        '3.D': 'M.3.AS',
-        '3.F': '3.C.1.b',
-        '3.H': '3.C.3',
-        '4': 'M.LULUCF',
-        '5': '4',
-        '5.A': '4.A',
-        '5.B': '4.B',
-        '5.C': '4.C',
-        '5.D': '4.D',
-        '5.D.1': '4.D.1',
-        '5.D.2': '4.D.2',
+        "3": "M.AG",
+        "3.A": "3.A.1",
+        "3.B": "3.A.2",
+        "3.C": "3.C.7",
+        "3.D": "M.3.AS",
+        "3.F": "3.C.1.b",
+        "3.H": "3.C.3",
+        "4": "M.LULUCF",
+        "5": "4",
+        "5.A": "4.A",
+        "5.B": "4.B",
+        "5.C": "4.C",
+        "5.D": "4.D",
+        "5.D.1": "4.D.1",
+        "5.D.2": "4.D.2",
     }
 
     aggregate_cats = {
-        '1.A': {'sources': ['1.A.1', '1.A.2', '1.A.3', '1.A.4'],
-                'name': 'Fuel Combustion Activities'},
-        '1.B': {'sources': ['1.B.1', '1.B.2'], 'name': 'Fugitive Emissions from Fuels'},
-        '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-        '3.C.1': {'sources': ['3.C.1.b'], 'name': 'Emissions from Biomass Burning'},
-        '3.C.5': {'sources': ['3.C.5.a', '3.C.5.b'],
-                  'name': 'Indirect N2O Emissions from Managed Soils'},
-        '3.C': {'sources': ['3.C.1', '3.C.3', 'M.3.AS', '3.C.7'],
-                'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
-        'M.AG.ELV': {'sources': ['3.C'],
-                     'name': 'Agriculture excluding livestock emissions'},
+        "1.A": {
+            "sources": ["1.A.1", "1.A.2", "1.A.3", "1.A.4"],
+            "name": "Fuel Combustion Activities",
+        },
+        "1.B": {"sources": ["1.B.1", "1.B.2"], "name": "Fugitive Emissions from Fuels"},
+        "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+        "3.C.1": {"sources": ["3.C.1.b"], "name": "Emissions from Biomass Burning"},
+        "3.C.5": {
+            "sources": ["3.C.5.a", "3.C.5.b"],
+            "name": "Indirect N2O Emissions from Managed Soils",
+        },
+        "3.C": {
+            "sources": ["3.C.1", "3.C.3", "M.3.AS", "3.C.7"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land",
+        },
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
+        "M.AG.ELV": {
+            "sources": ["3.C"],
+            "name": "Agriculture excluding livestock emissions",
+        },
     }
 
-
     # 2 for NF3, PFCs (from 2.E)
     aggregate_cats_NF3_PFC = {
-        '2': {'sources': ['2.E'], 'name': 'Industrial Process and Product Use Sector'},
+        "2": {"sources": ["2.E"], "name": "Industrial Process and Product Use Sector"},
     }
 
     compression = dict(zlib=True, complevel=9)
@@ -130,11 +147,10 @@ if __name__ == "__main__":
             str(input_folder / inventory_file),
             pages=page,
             **page_defs[page],
-            )
+        )
         for table in new_tables:
             all_tables.append(table.df)
 
-
     # ###
     # convert tables to primap2 format
     # ###
@@ -148,39 +164,49 @@ if __name__ == "__main__":
         if len(table_def["tables"]) > 1:
             for table in table_def["tables"][1:]:
                 df_this_table = pd.concat(
-                    [df_this_table, all_tables[table]],
-                    axis=0,
-                    join='outer')
+                    [df_this_table, all_tables[table]], axis=0, join="outer"
+                )
 
         # fix for table ES3.6
-        if table_name == 'ES3.6':
+        if table_name == "ES3.6":
             col_idx = df_this_table[0] == "Total CO Emission"
-            df_this_table.loc[col_idx, 1:] = ''
-            df_this_table.loc[col_idx, 0] = 'Total CO2 Emission'
+            df_this_table.loc[col_idx, 1:] = ""
+            df_this_table.loc[col_idx, 0] = "Total CO2 Emission"
 
         df_this_table = df_this_table.reset_index(drop=True)
 
         # fix categories if necessary
         if "fix_cats" in table_def.keys():
             for col in table_def["fix_cats"]:
-                df_this_table[col] = df_this_table[col].replace(table_def["fix_cats"][col])
+                df_this_table[col] = df_this_table[col].replace(
+                    table_def["fix_cats"][col]
+                )
 
         # fix rows
         for col in table_def["rows_to_fix"].keys():
             for n_rows in table_def["rows_to_fix"][col].keys():
                 print(f"Fixing {col}, {n_rows}")
                 # replace line breaks, long hyphens, double, and triple spaces in category names
-                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
-                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
-                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
-                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("-", "-")
-                df_this_table = fix_rows(df_this_table,
-                                         table_def["rows_to_fix"][col][n_rows], col, n_rows)
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "\n", " "
+                )
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "   ", " "
+                )
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "  ", " "
+                )
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "-", "-"
+                )
+                df_this_table = fix_rows(
+                    df_this_table, table_def["rows_to_fix"][col][n_rows], col, n_rows
+                )
 
         # split by entity
         if "gas_splitting" in table_def.keys():
-            col_entity = [''] * len(df_this_table)
-            last_entity = ''
+            col_entity = [""] * len(df_this_table)
+            last_entity = ""
             for i in range(0, len(df_this_table)):
                 current_header = df_this_table[table_def["col_wide_kwd"]].iloc[i]
                 if current_header in table_def["gas_splitting"].keys():
@@ -191,8 +217,12 @@ if __name__ == "__main__":
             table_def["index_cols"].append("entity")
 
         # make a wide table
-        df_this_table = make_wide_table(df_this_table, table_def["wide_keyword"],
-                                        table_def["col_wide_kwd"], table_def["index_cols"])
+        df_this_table = make_wide_table(
+            df_this_table,
+            table_def["wide_keyword"],
+            table_def["col_wide_kwd"],
+            table_def["index_cols"],
+        )
 
         if "drop_rows" in table_def.keys():
             df_this_table = df_this_table.drop(table_def["drop_rows"], axis=0)
@@ -207,11 +237,12 @@ if __name__ == "__main__":
         # add unit
         df_this_table["unit"] = table_def["unit"]
 
-        df_this_table = df_this_table.rename({table_def["index_cols"][0]: "orig_cat_name"},
-                                             axis=1)
+        df_this_table = df_this_table.rename(
+            {table_def["index_cols"][0]: "orig_cat_name"}, axis=1
+        )
 
         # print(table_def["index_cols"][0])
-        # print(df_this_table.columns.values)
+        # print(df_this_table.columns.to_numpy())
 
         # make a copy of the categories row
         df_this_table["category"] = df_this_table["orig_cat_name"]
@@ -219,25 +250,30 @@ if __name__ == "__main__":
         # replace cat names by codes in col "category"
         # first the manual replacements
         df_this_table["category"] = df_this_table["category"].replace(
-            table_def["cat_codes_manual"])
+            table_def["cat_codes_manual"]
+        )
+
         # then the regex replacements
-        def repl(m):
-            return m.group('code')
-        df_this_table["category"] = df_this_table["category"].str.replace(cat_code_regexp,
-                                                                          repl, regex=True)
+        def repl(m):  # noqa: D103
+            return m.group("code")
+
+        df_this_table["category"] = df_this_table["category"].str.replace(
+            cat_code_regexp, repl, regex=True
+        )
 
         ### convert to PRIMAP2 IF
         # remove ','
-        time_format = '%Y'
+        time_format = "%Y"
         time_columns = [
             col
-            for col in df_this_table.columns.values
+            for col in df_this_table.columns.to_numpy()
             if matches_time_format(col, time_format)
         ]
 
         for col in time_columns:
-            df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(',', '',
-                                                                              regex=False)
+            df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(
+                ",", "", regex=False
+            )
 
         # drop orig_cat_name as it's not unique per category
         df_this_table = df_this_table.drop(columns="orig_cat_name")
@@ -254,7 +290,7 @@ if __name__ == "__main__":
             # coords_value_filling=coords_value_filling,
             # filter_remove=filter_remove,
             # filter_keep=filter_keep,
-            meta_data=meta_data
+            meta_data=meta_data,
         )
 
         this_table_pm2 = pm2.pm2io.from_interchange_format(df_this_table_if)
@@ -267,7 +303,6 @@ if __name__ == "__main__":
     # convert back to IF to have units in the fixed format
     data_if = data_pm2.pr.to_interchange_format()
 
-
     # ###
     # convert to IPCC2006 categories
     # ###
@@ -275,31 +310,36 @@ if __name__ == "__main__":
     data_if_2006
     # filter_data(data_if_2006, filter_remove=filter_remove_IPCC2006)
     data_if_2006 = data_if_2006.replace(
-        {'category (IPCC2006_1996_Taiwan_Inv)': cat_mapping})
+        {"category (IPCC2006_1996_Taiwan_Inv)": cat_mapping}
+    )
 
     # rename the category col
-    data_if_2006.rename(
-        columns={'category (IPCC2006_1996_Taiwan_Inv)': 'category (IPCC2006_PRIMAP)'},
-        inplace=True)
-    data_if_2006.attrs['attrs']['cat'] = 'category (IPCC2006_PRIMAP)'
-    data_if_2006.attrs['dimensions']['*'] = [
-        'category (IPCC2006_PRIMAP)' if item == 'category (IPCC2006_1996_Taiwan_Inv)'
-        else item for item in data_if_2006.attrs['dimensions']['*']]
+    data_if_2006 = data_if_2006.rename(
+        columns={"category (IPCC2006_1996_Taiwan_Inv)": "category (IPCC2006_PRIMAP)"}
+    )
+    data_if_2006.attrs["attrs"]["cat"] = "category (IPCC2006_PRIMAP)"
+    data_if_2006.attrs["dimensions"]["*"] = [
+        "category (IPCC2006_PRIMAP)"
+        if item == "category (IPCC2006_1996_Taiwan_Inv)"
+        else item
+        for item in data_if_2006.attrs["dimensions"]["*"]
+    ]
 
     # aggregate categories
     for cat_to_agg in aggregate_cats:
         mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
-            aggregate_cats[cat_to_agg]["sources"])
+            aggregate_cats[cat_to_agg]["sources"]
+        )
         df_test = data_if_2006[mask]
 
         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -307,8 +347,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
             # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
@@ -324,19 +371,21 @@ if __name__ == "__main__":
     # aggregate categories
     for cat_to_agg in aggregate_cats_NF3_PFC:
         mask = data_if_2006["category (IPCC2006_PRIMAP)"].isin(
-            aggregate_cats_NF3_PFC[cat_to_agg]["sources"])
+            aggregate_cats_NF3_PFC[cat_to_agg]["sources"]
+        )
         mask_gas = data_if_2006["entity"].isin(
-            [f"NF3 ({gwp_to_use})", f"PFCS ({gwp_to_use})"])
+            [f"NF3 ({gwp_to_use})", f"PFCS ({gwp_to_use})"]
+        )
         df_test = data_if_2006[mask & mask_gas]
 
         if len(df_test) > 0:
             print(f"Aggregating category {cat_to_agg}")
             df_combine = df_test.copy(deep=True)
 
-            time_format = '%Y'
+            time_format = "%Y"
             time_columns = [
                 col
-                for col in df_combine.columns.values
+                for col in df_combine.columns.to_numpy()
                 if matches_time_format(col, time_format)
             ]
 
@@ -344,8 +393,15 @@ if __name__ == "__main__":
                 df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
 
             df_combine = df_combine.groupby(
-                by=['source', 'scenario (PRIMAP)', 'provenance', 'area (ISO3)', 'entity',
-                    'unit']).sum(min_count=1)
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum(min_count=1)
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
             # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
@@ -362,7 +418,7 @@ if __name__ == "__main__":
     data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
 
     # convert to mass units from CO2eq
-    entities_to_convert = ['N2O', 'SF6', 'CH4', 'NF3']
+    entities_to_convert = ["N2O", "SF6", "CH4", "NF3"]
     entities_to_convert = [f"{entity} ({gwp_to_use})" for entity in entities_to_convert]
 
     for entity in entities_to_convert:
@@ -382,19 +438,23 @@ if __name__ == "__main__":
     # save data
     # ###
     # data in original categories
-    pm2.pm2io.write_interchange_format(output_folder /
-                                       (output_filename + coords_terminologies["category"]),
-                                       data_if)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
     encoding = {var: compression for var in data_pm2.data_vars}
-    data_pm2.pr.to_netcdf((output_folder /
-                          (output_filename + coords_terminologies[
-                              "category"])).with_suffix(".nc"),
-                          encoding=encoding)
+    data_pm2.pr.to_netcdf(
+        (
+            output_folder / (output_filename + coords_terminologies["category"])
+        ).with_suffix(".nc"),
+        encoding=encoding,
+    )
 
     # data in 2006 categories
-    pm2.pm2io.write_interchange_format(output_folder /
-                                       (output_filename + "IPCC2006_PRIMAP"), data_if_2006)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + "IPCC2006_PRIMAP"), data_if_2006
+    )
     encoding = {var: compression for var in data_pm2_2006.data_vars}
-    data_pm2_2006.pr.to_netcdf((output_folder /
-                                (output_filename + "IPCC2006_PRIMAP")).with_suffix(".nc"),
-                               encoding=encoding)
+    data_pm2_2006.pr.to_netcdf(
+        (output_folder / (output_filename + "IPCC2006_PRIMAP")).with_suffix(".nc"),
+        encoding=encoding,
+    )

+ 30 - 0
src/unfccc_ghg_data/unfccc_reader/Thailand/__init__.py

@@ -0,0 +1,30 @@
+"""Read Thailand's BURs, NIRs, NCs
+
+Scripts and configurations to read Thailand's submissions to the UNFCCC.
+Currently, the following submissions and datasets are available (all datasets
+including DI (read using the DI-reader) and legacy BUR/NIR (no code)):
+
+.. exec_code::
+    :hide_code:
+
+    from unfccc_ghg_data.helper.functions import (get_country_datasets,
+                                                  get_country_submissions)
+    country = 'THA'
+    # print available submissions
+    print("="*15 + " Available submissions " + "="*15)
+    get_country_submissions(country, True)
+    print("")
+
+    #print available datasets
+    print("="*15 + " Available datasets " + "="*15)
+    get_country_datasets(country, True)
+
+You can also obtain this information running
+
+.. code-block:: bash
+
+    poetry run doit country_info country=THA
+
+See below for a listing of scripts for BUR/NIR reading including links.
+
+"""

+ 405 - 223
src/unfccc_ghg_data/unfccc_reader/Thailand/config_tha_bur3.py

@@ -1,38 +1,54 @@
-# configuration for Thailand, BUR4
+"""Config for Thailand's BUR3
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
+
+
 # ###
 # for reading
 # ###
 
 # general
 gwp_to_use = "AR4GWP100"
-terminology_proc = 'IPCC2006_PRIMAP'
+terminology_proc = "IPCC2006_PRIMAP"
 
-header_inventory = ['Greenhouse gas source and sink categories',
-                   'CO2 emissions', 'CO2 removals',
-                   'CH4', 'N2O', 'NOx', 'CO', 'NMVOCs',
-                   'SO2', 'HFCs', 'PFCs', 'SF6']
-unit_inventory = ['Gg'] * len(header_inventory)
+header_inventory = [
+    "Greenhouse gas source and sink categories",
+    "CO2 emissions",
+    "CO2 removals",
+    "CH4",
+    "N2O",
+    "NOx",
+    "CO",
+    "NMVOCs",
+    "SO2",
+    "HFCs",
+    "PFCs",
+    "SF6",
+]
+unit_inventory = ["Gg"] * len(header_inventory)
 unit_inventory[9] = "GgCO2eq"
 unit_inventory[10] = "GgCO2eq"
 
 # 2019 inventory
 inv_conf = {
-    'year': 2016,
-    'entity_row': 0,
-    'unit_row': 1,
-    'index_cols': "Greenhouse gas source and sink categories",
-    'header': header_inventory,
-    'unit': unit_inventory,
+    "year": 2016,
+    "entity_row": 0,
+    "unit_row": 1,
+    "index_cols": "Greenhouse gas source and sink categories",
+    "header": header_inventory,
+    "unit": unit_inventory,
     # special header as category UNFCCC_GHG_data and name in one column
-    'header_long': ["orig_cat_name", "entity", "unit", "time", "data"],
+    "header_long": ["orig_cat_name", "entity", "unit", "time", "data"],
     # manual category codes (manual mapping to primap1, will be mapped to primap2
     # # automatically with the other codes)
-    'cat_codes_manual': {
-        '6. Other Memo Items (not accounted in Total Emissions)': 'MEMO',
-        'International Bunkers': 'MBK',
-        'CO2 from Biomass': 'MBIO',
+    "cat_codes_manual": {
+        "6. Other Memo Items (not accounted in Total Emissions)": "MEMO",
+        "International Bunkers": "MBK",
+        "CO2 from Biomass": "MBIO",
     },
-    'cat_code_regexp': r'^(?P<code>[a-zA-Z0-9]{1,4})[\s\.].*',
+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9]{1,4})[\s\.].*",
 }
 
 # primap2 format conversion
@@ -59,14 +75,14 @@ coords_value_mapping = {
     "unit": "PRIMAP1",
     "category": "PRIMAP1",
     "entity": {
-        'HFCs': f"HFCS ({gwp_to_use})",
-        'PFCs': f"PFCS ({gwp_to_use})",
-        'NMVOCs': 'NMVOC',
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "NMVOCs": "NMVOC",
     },
 }
 
 filter_remove = {
-    'f_memo': {"category": "MEMO"},
+    "f_memo": {"category": "MEMO"},
 }
 filter_keep = {}
 
@@ -81,26 +97,31 @@ meta_data = {
 
 # main sector time series
 header_main_sector_ts = [
-    'Year', 'Energy', 'IPPU',
-    'Agriculture', 'LULUCF', 'Waste',
-    'Net emissions (Including LULUCF)',
-    'Net emissions (Excluding LULUCF)']
-unit_main_sector_ts = ['GgCO2eq'] * len(header_main_sector_ts)
-unit_main_sector_ts[0] = ''
+    "Year",
+    "Energy",
+    "IPPU",
+    "Agriculture",
+    "LULUCF",
+    "Waste",
+    "Net emissions (Including LULUCF)",
+    "Net emissions (Excluding LULUCF)",
+]
+unit_main_sector_ts = ["GgCO2eq"] * len(header_main_sector_ts)
+unit_main_sector_ts[0] = ""
 
 trend_conf = {
-    'header': header_main_sector_ts,
-    'unit': unit_main_sector_ts,
+    "header": header_main_sector_ts,
+    "unit": unit_main_sector_ts,
     # manual category codes (manual mapping to primap1, will be mapped to primap2
     # automatically with the other codes)
-    'cat_codes_manual': {
-        'Energy': "1",
-        'IPPU': "2",
-        'Agriculture': "3",
-        'LULUCF': "4",
-        'Waste': "5",
-        'Net emissions (Including LULUCF)': "0",
-        'Net emissions (Excluding LULUCF)': "M0EL",
+    "cat_codes_manual": {
+        "Energy": "1",
+        "IPPU": "2",
+        "Agriculture": "3",
+        "LULUCF": "4",
+        "Waste": "5",
+        "Net emissions (Including LULUCF)": "0",
+        "Net emissions (Excluding LULUCF)": "M0EL",
     },
 }
 
@@ -118,14 +139,13 @@ coords_defaults_main_sector_ts = {
 }
 
 # indirect gases time series
-header_indirect = ['Year', 'NOx', 'CO',
-                    'NMVOCs', 'SO2']
-unit_indirect = ['Gg'] * len(header_indirect)
-unit_indirect[0] = ''
+header_indirect = ["Year", "NOx", "CO", "NMVOCs", "SO2"]
+unit_indirect = ["Gg"] * len(header_indirect)
+unit_indirect[0] = ""
 ind_conf = {
-    'header': header_indirect,
-    'unit': unit_indirect,
-    'cols_to_remove': ['Average Annual Growth Rate'],
+    "header": header_indirect,
+    "unit": unit_indirect,
+    "cols_to_remove": ["Average Annual Growth Rate"],
 }
 
 coords_cols_indirect = {
@@ -146,111 +166,203 @@ coords_defaults_indirect = {
 # ###
 # aggregate categories
 country_processing_step1 = {
-    'aggregate_cats': {
-        '2.A.4': {'sources': ['2.A.4.b', '2.A.4.d'],
-                  'name': 'Other Process uses of Carbonates'},
+    "aggregate_cats": {
+        "2.A.4": {
+            "sources": ["2.A.4.b", "2.A.4.d"],
+            "name": "Other Process uses of Carbonates",
+        },
     },
-    'aggregate_gases': {
-        'KYOTOGHG': {
-            'basket': 'KYOTOGHG (AR4GWP100)',
-            'basket_contents': ['CO2', 'CH4', 'N2O', 'SF6',
-                                'HFCS (AR4GWP100)', 'PFCS (AR4GWP100)'],
-            'skipna': True,
-            'min_count': 1,
-            'sel': {f'category ({coords_terminologies["category"]})':
-                [
-                    '0', '1', '1.A', '1.A.1', '1.A.2', '1.A.3',
-                    '1.A.4', '1.B', '1.B.1', '1.B.2',
-                    '1.C',
-                    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4',
-                    '2.B', '2.C', '2.D', '2.H',
-                    '3', '3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                    '3.H', '3.I',
-                    '4', '4.A', '4.B', '4.C', '4.D', '4.E',
-                    '5', '5.A', '5.B', '5.C', '5.D'
+    "aggregate_gases": {
+        "KYOTOGHG": {
+            "basket": "KYOTOGHG (AR4GWP100)",
+            "basket_contents": [
+                "CO2",
+                "CH4",
+                "N2O",
+                "SF6",
+                "HFCS (AR4GWP100)",
+                "PFCS (AR4GWP100)",
+            ],
+            "skipna": True,
+            "min_count": 1,
+            "sel": {
+                f'category ({coords_terminologies["category"]})': [
+                    "0",
+                    "1",
+                    "1.A",
+                    "1.A.1",
+                    "1.A.2",
+                    "1.A.3",
+                    "1.A.4",
+                    "1.B",
+                    "1.B.1",
+                    "1.B.2",
+                    "1.C",
+                    "2",
+                    "2.A",
+                    "2.A.1",
+                    "2.A.2",
+                    "2.A.3",
+                    "2.A.4",
+                    "2.B",
+                    "2.C",
+                    "2.D",
+                    "2.H",
+                    "3",
+                    "3.A",
+                    "3.B",
+                    "3.C",
+                    "3.D",
+                    "3.E",
+                    "3.F",
+                    "3.G",
+                    "3.H",
+                    "3.I",
+                    "4",
+                    "4.A",
+                    "4.B",
+                    "4.C",
+                    "4.D",
+                    "4.E",
+                    "5",
+                    "5.A",
+                    "5.B",
+                    "5.C",
+                    "5.D",
                 ]
-            }, # not tested
+            },  # not tested
         },
     },
 }
 
 country_processing_step2 = {
-    'downscale': {
+    "downscale": {
         # main sectors present as KYOTOGHG sum. subsectors need to be downscaled
         # TODO: downscale CO, NOx, NMVOC, SO2 (national total present)
-        'sectors': {
-            '1': {
-                'basket': '1',
-                'basket_contents': ['1.A', '1.B', '1.C'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+        "sectors": {
+            "1": {
+                "basket": "1",
+                "basket_contents": ["1.A", "1.B", "1.C"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '1.A': {
-                'basket': '1.A',
-                'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "1.A": {
+                "basket": "1.A",
+                "basket_contents": ["1.A.1", "1.A.2", "1.A.3", "1.A.4"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '1.B': {
-                'basket': '1.B',
-                'basket_contents': ['1.B.1', '1.B.2'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "1.B": {
+                "basket": "1.B",
+                "basket_contents": ["1.B.1", "1.B.2"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '2': {
-                'basket': '2',
-                'basket_contents': ['2.A', '2.B', '2.C', '2.D', '2.H'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "2": {
+                "basket": "2",
+                "basket_contents": ["2.A", "2.B", "2.C", "2.D", "2.H"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '2.A': {
-                'basket': '2.A',
-                'basket_contents': ['2.A.1', '2.A.2', '2.A.3', '2.A.4'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "2.A": {
+                "basket": "2.A",
+                "basket_contents": ["2.A.1", "2.A.2", "2.A.3", "2.A.4"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '3': {
-                'basket': '3',
-                'basket_contents': ['3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                                    '3.H', '3.I'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "3": {
+                "basket": "3",
+                "basket_contents": [
+                    "3.A",
+                    "3.B",
+                    "3.C",
+                    "3.D",
+                    "3.E",
+                    "3.F",
+                    "3.G",
+                    "3.H",
+                    "3.I",
+                ],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '4': {
-                'basket': '4',
-                'basket_contents': ['4.A', '4.B', '4.C', '4.D', '4.E'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "4": {
+                "basket": "4",
+                "basket_contents": ["4.A", "4.B", "4.C", "4.D", "4.E"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '5': {
-                'basket': '5',
-                'basket_contents': ['5.A', '5.B', '5.C', '5.D'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "5": {
+                "basket": "5",
+                "basket_contents": ["5.A", "5.B", "5.C", "5.D"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
         },
-        'entities': {
-            'KYOTO': {
-                'basket': 'KYOTOGHG (AR4GWP100)',
-                'basket_contents': ['CH4', 'CO2', 'N2O', 'HFCS (AR4GWP100)',
-                                    'PFCS (AR4GWP100)', 'SF6'],
-                'sel': {f'category ({coords_terminologies["category"]})':
-                    [
-                        '0', '1', '1.A', '1.A.1', '1.A.2', '1.A.3',
-                        '1.A.4', '1.B', '1.B.1', '1.B.2', '1.C',
-                        '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4',
-                        '2.B', '2.C', '2.D', '2.H',
-                        '3', '3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                        '3.H', '3.I',
-                        '4', '4.A', '4.B', '4.C', '4.D', '4.E',
-                        '5', '5.A', '5.B', '5.C', '5.D']},
+        "entities": {
+            "KYOTO": {
+                "basket": "KYOTOGHG (AR4GWP100)",
+                "basket_contents": [
+                    "CH4",
+                    "CO2",
+                    "N2O",
+                    "HFCS (AR4GWP100)",
+                    "PFCS (AR4GWP100)",
+                    "SF6",
+                ],
+                "sel": {
+                    f'category ({coords_terminologies["category"]})': [
+                        "0",
+                        "1",
+                        "1.A",
+                        "1.A.1",
+                        "1.A.2",
+                        "1.A.3",
+                        "1.A.4",
+                        "1.B",
+                        "1.B.1",
+                        "1.B.2",
+                        "1.C",
+                        "2",
+                        "2.A",
+                        "2.A.1",
+                        "2.A.2",
+                        "2.A.3",
+                        "2.A.4",
+                        "2.B",
+                        "2.C",
+                        "2.D",
+                        "2.H",
+                        "3",
+                        "3.A",
+                        "3.B",
+                        "3.C",
+                        "3.D",
+                        "3.E",
+                        "3.F",
+                        "3.G",
+                        "3.H",
+                        "3.I",
+                        "4",
+                        "4.A",
+                        "4.B",
+                        "4.C",
+                        "4.D",
+                        "4.E",
+                        "5",
+                        "5.A",
+                        "5.B",
+                        "5.C",
+                        "5.D",
+                    ]
+                },
             },
         },
     },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
     },
 }
 ## not in BUR3: 1.A.1.a, 1.A.1.b, 1.A.3.a, 1.A.3.b, 1.A.3.c, 1.A.3.d, 1.A.5, 1.B.3,
@@ -258,106 +370,176 @@ country_processing_step2 = {
 # 4.E.x, 5.X.y M.BK.A, M.BK.M
 
 cat_conversion = {
-    'mapping': {
-        '0': '0',
-        'M.0.EL': 'M.0.EL',
-        '1': '1',
-        '1.A': '1.A',
-        '1.A.1': '1.A.1',
-        '1.A.2': '1.A.2',
-        '1.A.3': '1.A.3',
-        '1.A.4': '1.A.4',
-        '1.B': '1.B',
-        '1.B.1': '1.B.1',
-        '1.B.2': '1.B.2',
-        '1.C': '1.C',
-        '1.C.1': '1.C.1',
-        '1.C.2': '1.C.2',
-        '1.C.3': '1.C.3',
-        '2': '2',
-        '2.A': '2.A',
-        '2.A.1': '2.A.1',
-        '2.A.2': '2.A.2',
-        '2.A.3': '2.A.3',
-        '2.A.4': '2.A.4',
-        '2.A.4.b': '2.A.4.b',
-        '2.A.4.d': '2.A.4.d',
-        '2.B': '2.B',
-        '2.C': '2.C',
-        '2.C.1': '2.C.1',
-        '2.D': '2.D',
-        '2.D.1': '2.D.1',
-        '2.H': '2.H',
-        '2.H.1': '2.H.1',
-        '2.H.2': '2.H.2',
-        '3': 'M.AG',
-        '3.A': '3.A.1',
-        '3.B': '3.A.2',
-        '3.C': 'M.3.C.1.AG',  # field burning of agricultural residues
-        '3.D': '3.C.2',  # Liming
-        '3.E': '3.C.3',  # urea application
-        '3.F': '3.C.4',  # direct N2O from agri soils
-        '3.G': '3.C.5',  # indirect N2O from agri soils
-        '3.H': '3.C.6',  # indirect N2O from manure management
-        '3.I': '3.C.7',  # rice
-        '4': 'M.LULUCF',
-        '4.A': '3.B.1.a',  # forest remaining forest
-        '4.B': '3.B.2.a',  # cropland remaining cropland
-        '4.C': '3.B.2.b',  # land converted to cropland
-        '4.D': '3.B.6.b',  # land converted to other land
-        '4.E': 'M.3.C.1.LU',  # biomass burning (LULUCF)
-        '5': '4',
-        '5.A': '4.A',
-        '5.B': '4.B',
-        '5.C': '4.C',
-        '5.D': '4.D',
-        'M.BK': 'M.BK',
-        'M.BIO': 'M.BIO',
+    "mapping": {
+        "0": "0",
+        "M.0.EL": "M.0.EL",
+        "1": "1",
+        "1.A": "1.A",
+        "1.A.1": "1.A.1",
+        "1.A.2": "1.A.2",
+        "1.A.3": "1.A.3",
+        "1.A.4": "1.A.4",
+        "1.B": "1.B",
+        "1.B.1": "1.B.1",
+        "1.B.2": "1.B.2",
+        "1.C": "1.C",
+        "1.C.1": "1.C.1",
+        "1.C.2": "1.C.2",
+        "1.C.3": "1.C.3",
+        "2": "2",
+        "2.A": "2.A",
+        "2.A.1": "2.A.1",
+        "2.A.2": "2.A.2",
+        "2.A.3": "2.A.3",
+        "2.A.4": "2.A.4",
+        "2.A.4.b": "2.A.4.b",
+        "2.A.4.d": "2.A.4.d",
+        "2.B": "2.B",
+        "2.C": "2.C",
+        "2.C.1": "2.C.1",
+        "2.D": "2.D",
+        "2.D.1": "2.D.1",
+        "2.H": "2.H",
+        "2.H.1": "2.H.1",
+        "2.H.2": "2.H.2",
+        "3": "M.AG",
+        "3.A": "3.A.1",
+        "3.B": "3.A.2",
+        "3.C": "M.3.C.1.AG",  # field burning of agricultural residues
+        "3.D": "3.C.2",  # Liming
+        "3.E": "3.C.3",  # urea application
+        "3.F": "3.C.4",  # direct N2O from agri soils
+        "3.G": "3.C.5",  # indirect N2O from agri soils
+        "3.H": "3.C.6",  # indirect N2O from manure management
+        "3.I": "3.C.7",  # rice
+        "4": "M.LULUCF",
+        "4.A": "3.B.1.a",  # forest remaining forest
+        "4.B": "3.B.2.a",  # cropland remaining cropland
+        "4.C": "3.B.2.b",  # land converted to cropland
+        "4.D": "3.B.6.b",  # land converted to other land
+        "4.E": "M.3.C.1.LU",  # biomass burning (LULUCF)
+        "5": "4",
+        "5.A": "4.A",
+        "5.B": "4.B",
+        "5.C": "4.C",
+        "5.D": "4.D",
+        "M.BK": "M.BK",
+        "M.BIO": "M.BIO",
     },
-    'aggregate': {
-        '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-        '3.C.1': {'sources': ['M.3.C.1.AG', 'M.3.C.1.LU'],
-                  'name': 'Emissions from Biomass Burning'},
-        '3.C': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-                'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-        'M.3.C.AG': {
-            'sources': ['M.3.C.1.AG', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG'],
-                     'name': 'Agriculture excluding livestock emissions'},
-        'M.3.C.LU': {'sources': ['M.3.C.1.LU'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land (Land use)'},
-        '3.B.1': {'sources': ['3.B.1.a'], 'name': 'Forest Land'},
-        '3.B.2': {'sources': ['3.B.2.a', '3.B.2.b'], 'name': 'Cropland'},
-        '3.B.6': {'sources': ['3.B.6.b'], 'name': 'Other Land'},
-        '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.6'], 'name': 'Land'},
-        'M.LULUCF': {'sources': ['3.B', 'N.3.C.LU'], 'name': 'LULUCF'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    "aggregate": {
+        "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+        "3.C.1": {
+            "sources": ["M.3.C.1.AG", "M.3.C.1.LU"],
+            "name": "Emissions from Biomass Burning",
+        },
+        "3.C": {
+            "sources": ["3.C.1", "3.C.2", "3.C.3", "3.C.4", "3.C.5", "3.C.6", "3.C.7"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land",
+        },
+        "M.3.C.AG": {
+            "sources": [
+                "M.3.C.1.AG",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+            ],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+        },
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG"],
+            "name": "Agriculture excluding livestock emissions",
+        },
+        "M.3.C.LU": {
+            "sources": ["M.3.C.1.LU"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Land use)",
+        },
+        "3.B.1": {"sources": ["3.B.1.a"], "name": "Forest Land"},
+        "3.B.2": {"sources": ["3.B.2.a", "3.B.2.b"], "name": "Cropland"},
+        "3.B.6": {"sources": ["3.B.6.b"], "name": "Other Land"},
+        "3.B": {"sources": ["3.B.1", "3.B.2", "3.B.6"], "name": "Land"},
+        "M.LULUCF": {"sources": ["3.B", "N.3.C.LU"], "name": "LULUCF"},
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
     },
 }
 
 sectors_to_save = [
-    '1', '1.A', '1.A.1', '1.A.2', '1.A.3', '1.A.4',
-    '1.B', '1.B.1', '1.B.2', '1.C', '1.C.1', '1.C.2', '1.C.3',
-    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4', '2.A.4.b', '2.A.4.d',
-    '2.B', '2.C', '2.C.1', '2.H', '2.H.1', '2.H.2',
-    '3', 'M.AG', '3.A', '3.A.1', '3.A.2',
-    '3.C', '3.C.1', '3.C.2', '3.C.3', '3.C.4',
-    '3.C.5', '3.C.6', '3.C.7', 'M.3.C.1.AG', 'M.3.C.AG', 'M.AG.ELV',
-    'M.LULUCF', 'M.3.C.1.LU', 'M.3.C.LU', '3.B', '3.B.1', '3.B.1.a', '3.B.2', '3.B.2.a',
-    '3.B.2.b', '3.B.6', '3.B.6.b',
-    '4', '4.A', '4.B', '4.C', '4.D',
-    '0', 'M.0.EL', 'M.BK', 'M.BIO']
+    "1",
+    "1.A",
+    "1.A.1",
+    "1.A.2",
+    "1.A.3",
+    "1.A.4",
+    "1.B",
+    "1.B.1",
+    "1.B.2",
+    "1.C",
+    "1.C.1",
+    "1.C.2",
+    "1.C.3",
+    "2",
+    "2.A",
+    "2.A.1",
+    "2.A.2",
+    "2.A.3",
+    "2.A.4",
+    "2.A.4.b",
+    "2.A.4.d",
+    "2.B",
+    "2.C",
+    "2.C.1",
+    "2.H",
+    "2.H.1",
+    "2.H.2",
+    "3",
+    "M.AG",
+    "3.A",
+    "3.A.1",
+    "3.A.2",
+    "3.C",
+    "3.C.1",
+    "3.C.2",
+    "3.C.3",
+    "3.C.4",
+    "3.C.5",
+    "3.C.6",
+    "3.C.7",
+    "M.3.C.1.AG",
+    "M.3.C.AG",
+    "M.AG.ELV",
+    "M.LULUCF",
+    "M.3.C.1.LU",
+    "M.3.C.LU",
+    "3.B",
+    "3.B.1",
+    "3.B.1.a",
+    "3.B.2",
+    "3.B.2.a",
+    "3.B.2.b",
+    "3.B.6",
+    "3.B.6.b",
+    "4",
+    "4.A",
+    "4.B",
+    "4.C",
+    "4.D",
+    "0",
+    "M.0.EL",
+    "M.BK",
+    "M.BIO",
+]
 
 
 # gas baskets
 gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
 }

+ 461 - 250
src/unfccc_ghg_data/unfccc_reader/Thailand/config_tha_bur4.py

@@ -1,31 +1,35 @@
-# configuration for Thailand, BUR4
+"""Config for Thailand's BUR5
+
+Full configuration including PRIMAP2 conversion config and metadata
+
+"""
 # ###
 # for reading
 # ###
 
 # general
 gwp_to_use = "AR4GWP100"
-terminology_proc = 'IPCC2006_PRIMAP'
+terminology_proc = "IPCC2006_PRIMAP"
 
 # 2019 inventory
 inv_conf = {
-    'year': 2019,
-    'entity_row': 0,
-    'unit_row': 1,
-    'index_cols': "Greenhouse gas source and sink categories",
+    "year": 2019,
+    "entity_row": 0,
+    "unit_row": 1,
+    "index_cols": "Greenhouse gas source and sink categories",
     # special header as category UNFCCC_GHG_data and name in one column
-    'header_long': ["orig_cat_name", "entity", "unit", "time", "data"],
+    "header_long": ["orig_cat_name", "entity", "unit", "time", "data"],
     # manual category codes (manual mapping to primap1, will be mapped to primap2
     # # automatically with the other codes)
-    'cat_codes_manual': {
-        'Total national emissions and removals': '0',
-        'Memo Items (not accounted in total Emissions)': 'MEMO',
-        'International Bunkers': 'MBK',
-        'Aviation International Bunkers': 'MBKA',
-        'Marine-International Bunkers': 'MBKM',
-        'CO2 from biomass': 'MBIO',
+    "cat_codes_manual": {
+        "Total national emissions and removals": "0",
+        "Memo Items (not accounted in total Emissions)": "MEMO",
+        "International Bunkers": "MBK",
+        "Aviation International Bunkers": "MBKA",
+        "Marine-International Bunkers": "MBKM",
+        "CO2 from biomass": "MBIO",
     },
-    'cat_code_regexp': r'^(?P<code>[a-zA-Z0-9]{1,4})[\s\.].*',
+    "cat_code_regexp": r"^(?P<code>[a-zA-Z0-9]{1,4})[\s\.].*",
 }
 
 # primap2 format conversion
@@ -52,16 +56,16 @@ coords_value_mapping = {
     "unit": "PRIMAP1",
     "category": "PRIMAP1",
     "entity": {
-        'HFCs': f"HFCS ({gwp_to_use})",
-        'PFCs': f"PFCS ({gwp_to_use})",
-        'SF6': f'SF6 ({gwp_to_use})',
-        'NMVOCs': 'NMVOC',
-        'Nox': 'NOx',
+        "HFCs": f"HFCS ({gwp_to_use})",
+        "PFCs": f"PFCS ({gwp_to_use})",
+        "SF6": f"SF6 ({gwp_to_use})",
+        "NMVOCs": "NMVOC",
+        "Nox": "NOx",
     },
 }
 
 filter_remove = {
-    'f_memo': {"category": "MEMO"},
+    "f_memo": {"category": "MEMO"},
 }
 filter_keep = {}
 
@@ -78,13 +82,13 @@ meta_data = {
 # manual category codes (manual mapping to primap1, will be mapped to primap2
 # automatically with the other codes)
 cat_codes_manual_main_sector_ts = {
-    'Energy': "1",
-    'Industrial Processes and Product Use': "2",
-    'Agriculture': "3",
-    'LULUCF': "4",
-    'Waste': "5",
-    'Net emissions (Include LULUCF)': "0",
-    'Total emissions (Exclude LULUCF)': "M0EL",
+    "Energy": "1",
+    "Industrial Processes and Product Use": "2",
+    "Agriculture": "3",
+    "LULUCF": "4",
+    "Waste": "5",
+    "Net emissions (Include LULUCF)": "0",
+    "Total emissions (Exclude LULUCF)": "M0EL",
 }
 
 coords_cols_main_sector_ts = {
@@ -119,263 +123,470 @@ coords_defaults_indirect = {
 # ###
 # aggregate categories
 country_processing_step1 = {
-    'aggregate_cats': {
-        '2.A.4': {'sources': ['2.A.4.b', '2.A.4.d'],
-                  'name': 'Other Process uses of Carbonates'},
-        '2.B.8': {'sources': ['2.B.8.b', '2.B.8.c', '2.B.8.e', '2.B.8.f'],
-                  'name': 'Petrochemical and Carbon Black production'},
+    "aggregate_cats": {
+        "2.A.4": {
+            "sources": ["2.A.4.b", "2.A.4.d"],
+            "name": "Other Process uses of Carbonates",
+        },
+        "2.B.8": {
+            "sources": ["2.B.8.b", "2.B.8.c", "2.B.8.e", "2.B.8.f"],
+            "name": "Petrochemical and Carbon Black production",
+        },
     },
-    'aggregate_gases': {
-        'KYOTOGHG': {
-            'basket': 'KYOTOGHG (AR4GWP100)',
-            'basket_contents': ['CO2', 'CH4', 'N2O', 'SF6',
-                                'HFCS (AR4GWP100)', 'PFCS (AR4GWP100)'],
-            'skipna': True,
-            'min_count': 1,
-            'sel': {f'category ({coords_terminologies["category"]})':
-                [
-                    '0', '1', '1.A', '1.A.1', '1.A.2', '1.A.3',
-                    '1.A.4', '1.A.5', '1.B', '1.B.1', '1.B.2', '1.B.3',
-                    '1.C',
-                    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4',
-                    '2.B', '2.C', '2.D', '2.F', '2.G', '2.H',
-                    '3', '3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                    '3.H', '3.I',
-                    '4', '4.A', '4.B', '4.C', '4.D',
-                    '4.E', '4.E.1', '4.E.2', '4.E.3',
-                    '5', '5.A', '5.B', '5.C', '5.D'
+    "aggregate_gases": {
+        "KYOTOGHG": {
+            "basket": "KYOTOGHG (AR4GWP100)",
+            "basket_contents": [
+                "CO2",
+                "CH4",
+                "N2O",
+                "SF6",
+                "HFCS (AR4GWP100)",
+                "PFCS (AR4GWP100)",
+            ],
+            "skipna": True,
+            "min_count": 1,
+            "sel": {
+                f'category ({coords_terminologies["category"]})': [
+                    "0",
+                    "1",
+                    "1.A",
+                    "1.A.1",
+                    "1.A.2",
+                    "1.A.3",
+                    "1.A.4",
+                    "1.A.5",
+                    "1.B",
+                    "1.B.1",
+                    "1.B.2",
+                    "1.B.3",
+                    "1.C",
+                    "2",
+                    "2.A",
+                    "2.A.1",
+                    "2.A.2",
+                    "2.A.3",
+                    "2.A.4",
+                    "2.B",
+                    "2.C",
+                    "2.D",
+                    "2.F",
+                    "2.G",
+                    "2.H",
+                    "3",
+                    "3.A",
+                    "3.B",
+                    "3.C",
+                    "3.D",
+                    "3.E",
+                    "3.F",
+                    "3.G",
+                    "3.H",
+                    "3.I",
+                    "4",
+                    "4.A",
+                    "4.B",
+                    "4.C",
+                    "4.D",
+                    "4.E",
+                    "4.E.1",
+                    "4.E.2",
+                    "4.E.3",
+                    "5",
+                    "5.A",
+                    "5.B",
+                    "5.C",
+                    "5.D",
                 ]
-            }, # not tested
+            },  # not tested
         },
     },
 }
 
 country_processing_step2 = {
-    'downscale': {
+    "downscale": {
         # main sectors present as KYOTOGHG sum. subsectors need to be downscaled
         # TODO: downscale CO, NOx, NMVOC, SO2 (national total present)
-        'sectors': {
-            '1': {
-                'basket': '1',
-                'basket_contents': ['1.A', '1.B', '1.C'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+        "sectors": {
+            "1": {
+                "basket": "1",
+                "basket_contents": ["1.A", "1.B", "1.C"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '1.A': {
-                'basket': '1.A',
-                'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4', '1.A.5'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "1.A": {
+                "basket": "1.A",
+                "basket_contents": ["1.A.1", "1.A.2", "1.A.3", "1.A.4", "1.A.5"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '1.B': {
-                'basket': '1.B',
-                'basket_contents': ['1.B.1', '1.B.2', '1.B.3'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "1.B": {
+                "basket": "1.B",
+                "basket_contents": ["1.B.1", "1.B.2", "1.B.3"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '2': {
-                'basket': '2',
-                'basket_contents': ['2.A', '2.B', '2.C', '2.D', '2.F', '2.G', '2.H'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "2": {
+                "basket": "2",
+                "basket_contents": ["2.A", "2.B", "2.C", "2.D", "2.F", "2.G", "2.H"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '2.A': {
-                'basket': '2.A',
-                'basket_contents': ['2.A.1', '2.A.2', '2.A.3', '2.A.4'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "2.A": {
+                "basket": "2.A",
+                "basket_contents": ["2.A.1", "2.A.2", "2.A.3", "2.A.4"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '3': {
-                'basket': '3',
-                'basket_contents': ['3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                                    '3.H', '3.I'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "3": {
+                "basket": "3",
+                "basket_contents": [
+                    "3.A",
+                    "3.B",
+                    "3.C",
+                    "3.D",
+                    "3.E",
+                    "3.F",
+                    "3.G",
+                    "3.H",
+                    "3.I",
+                ],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '4': {
-                'basket': '4',
-                'basket_contents': ['4.A', '4.B', '4.C', '4.D', '4.E'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "4": {
+                "basket": "4",
+                "basket_contents": ["4.A", "4.B", "4.C", "4.D", "4.E"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '4.E': {
-                'basket': '4.E',
-                'basket_contents': ['4.E.1', '4.E.2', '4.E.3'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "4.E": {
+                "basket": "4.E",
+                "basket_contents": ["4.E.1", "4.E.2", "4.E.3"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
-            '5': {
-                'basket': '5',
-                'basket_contents': ['5.A', '5.B', '5.C', '5.D'],
-                'entities': ['KYOTOGHG (AR4GWP100)'],
-                'dim': f'category ({coords_terminologies["category"]})',
+            "5": {
+                "basket": "5",
+                "basket_contents": ["5.A", "5.B", "5.C", "5.D"],
+                "entities": ["KYOTOGHG (AR4GWP100)"],
+                "dim": f'category ({coords_terminologies["category"]})',
             },
         },
-        'entities': {
-            'KYOTO': {
-                'basket': 'KYOTOGHG (AR4GWP100)',
-                'basket_contents': ['CH4', 'CO2', 'N2O', 'HFCS (AR4GWP100)',
-                                    'PFCS (AR4GWP100)', 'SF6'],
-                'sel': {f'category ({coords_terminologies["category"]})':
-                    [
-                        '1', '1.A', '1.A.1', '1.A.2', '1.A.3',
-                        '1.A.4', '1.A.5', '1.B', '1.B.1', '1.B.2', '1.B.3',
-                        '1.C',
-                        '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4',
-                        '2.B', '2.C', '2.D', '2.F', '2.G', '2.H',
-                        '3', '3.A', '3.B', '3.C', '3.D', '3.E', '3.F', '3.G',
-                        '3.H', '3.I',
-                        '4', '4.A', '4.B', '4.C', '4.D',
-                        '4.E', '4.E.1', '4.E.2', '4.E.3',
-                        '5', '5.A', '5.B', '5.C', '5.D']},
+        "entities": {
+            "KYOTO": {
+                "basket": "KYOTOGHG (AR4GWP100)",
+                "basket_contents": [
+                    "CH4",
+                    "CO2",
+                    "N2O",
+                    "HFCS (AR4GWP100)",
+                    "PFCS (AR4GWP100)",
+                    "SF6",
+                ],
+                "sel": {
+                    f'category ({coords_terminologies["category"]})': [
+                        "1",
+                        "1.A",
+                        "1.A.1",
+                        "1.A.2",
+                        "1.A.3",
+                        "1.A.4",
+                        "1.A.5",
+                        "1.B",
+                        "1.B.1",
+                        "1.B.2",
+                        "1.B.3",
+                        "1.C",
+                        "2",
+                        "2.A",
+                        "2.A.1",
+                        "2.A.2",
+                        "2.A.3",
+                        "2.A.4",
+                        "2.B",
+                        "2.C",
+                        "2.D",
+                        "2.F",
+                        "2.G",
+                        "2.H",
+                        "3",
+                        "3.A",
+                        "3.B",
+                        "3.C",
+                        "3.D",
+                        "3.E",
+                        "3.F",
+                        "3.G",
+                        "3.H",
+                        "3.I",
+                        "4",
+                        "4.A",
+                        "4.B",
+                        "4.C",
+                        "4.D",
+                        "4.E",
+                        "4.E.1",
+                        "4.E.2",
+                        "4.E.3",
+                        "5",
+                        "5.A",
+                        "5.B",
+                        "5.C",
+                        "5.D",
+                    ]
+                },
             },
         },
     },
-    'basket_copy': {
-        'GWPs_to_add': ["SARGWP100", "AR5GWP100", "AR6GWP100"],
-        'entities': ["HFCS", "PFCS"],
-        'source_GWP': gwp_to_use,
+    "basket_copy": {
+        "GWPs_to_add": ["SARGWP100", "AR5GWP100", "AR6GWP100"],
+        "entities": ["HFCS", "PFCS"],
+        "source_GWP": gwp_to_use,
     },
 }
 
 cat_conversion = {
-    'mapping': {
-        '0': '0',
-        'M.0.EL': 'M.0.EL',
-        '1': '1',
-        '1.A': '1.A',
-        '1.A.1': '1.A.1',
-        '1.A.1.a': '1.A.1.a',
-        '1.A.1.b': '1.A.1.b',
-        '1.A.2': '1.A.2',
-        '1.A.3': '1.A.3',
-        '1.A.3.a': '1.A.3.a',
-        '1.A.3.b': '1.A.3.b',
-        '1.A.3.c': '1.A.3.c',
-        '1.A.3.d': '1.A.3.d',
-        '1.A.4': '1.A.4',
-        '1.A.5': '1.A.5',
-        '1.B': '1.B',
-        '1.B.1': '1.B.1',
-        '1.B.2': '1.B.2',
-        '1.B.3': '1.B.3',
-        '1.C': '1.C',
-        '1.C.1': '1.C.1',
-        '1.C.2': '1.C.2',
-        '1.C.3': '1.C.3',
-        '2': '2',
-        '2.A': '2.A',
-        '2.A.1': '2.A.1',
-        '2.A.2': '2.A.2',
-        '2.A.3': '2.A.3',
-        '2.A.4': '2.A.4',
-        '2.A.4.b': '2.A.4.b',
-        '2.A.4.d': '2.A.4.d',
-        '2.B': '2.B',
-        '2.B.2': '2.B.2',
-        '2.B.4': '2.B.4',
-        '2.B.8': '2.B.8',
-        '2.B.8.b': '2.B.8.b',
-        '2.B.8.c': '2.B.8.c',
-        '2.B.8.e': '2.B.8.e',
-        '2.B.8.f': '2.B.8.f',
-        '2.C': '2.C',
-        '2.C.1': '2.C.1',
-        '2.D': '2.D',
-        '2.D.1': '2.D.1',
-        '2.F': '2.F',
-        '2.F.1': '2.F.1',
-        '2.G': '2.G',
-        '2.G.1': '2.G.1',
-        '2.H': '2.H',
-        '2.H.1': '2.H.1',
-        '2.H.2': '2.H.2',
-        '3': 'M.AG',
-        '3.A': '3.A.1',
-        '3.B': '3.A.2',
-        '3.C': 'M.3.C.1.b.i',  # field burning of agricultural residues
-        '3.D': '3.C.2',  # Liming
-        '3.E': '3.C.3',  # urea application
-        '3.F': '3.C.4',  # direct N2O from agri soils
-        '3.G': '3.C.5',  # indirect N2O from agri soils
-        '3.H': '3.C.6',  # indirect N2O from manure management
-        '3.I': '3.C.7',  # rice
+    "mapping": {
+        "0": "0",
+        "M.0.EL": "M.0.EL",
+        "1": "1",
+        "1.A": "1.A",
+        "1.A.1": "1.A.1",
+        "1.A.1.a": "1.A.1.a",
+        "1.A.1.b": "1.A.1.b",
+        "1.A.2": "1.A.2",
+        "1.A.3": "1.A.3",
+        "1.A.3.a": "1.A.3.a",
+        "1.A.3.b": "1.A.3.b",
+        "1.A.3.c": "1.A.3.c",
+        "1.A.3.d": "1.A.3.d",
+        "1.A.4": "1.A.4",
+        "1.A.5": "1.A.5",
+        "1.B": "1.B",
+        "1.B.1": "1.B.1",
+        "1.B.2": "1.B.2",
+        "1.B.3": "1.B.3",
+        "1.C": "1.C",
+        "1.C.1": "1.C.1",
+        "1.C.2": "1.C.2",
+        "1.C.3": "1.C.3",
+        "2": "2",
+        "2.A": "2.A",
+        "2.A.1": "2.A.1",
+        "2.A.2": "2.A.2",
+        "2.A.3": "2.A.3",
+        "2.A.4": "2.A.4",
+        "2.A.4.b": "2.A.4.b",
+        "2.A.4.d": "2.A.4.d",
+        "2.B": "2.B",
+        "2.B.2": "2.B.2",
+        "2.B.4": "2.B.4",
+        "2.B.8": "2.B.8",
+        "2.B.8.b": "2.B.8.b",
+        "2.B.8.c": "2.B.8.c",
+        "2.B.8.e": "2.B.8.e",
+        "2.B.8.f": "2.B.8.f",
+        "2.C": "2.C",
+        "2.C.1": "2.C.1",
+        "2.D": "2.D",
+        "2.D.1": "2.D.1",
+        "2.F": "2.F",
+        "2.F.1": "2.F.1",
+        "2.G": "2.G",
+        "2.G.1": "2.G.1",
+        "2.H": "2.H",
+        "2.H.1": "2.H.1",
+        "2.H.2": "2.H.2",
+        "3": "M.AG",
+        "3.A": "3.A.1",
+        "3.B": "3.A.2",
+        "3.C": "M.3.C.1.b.i",  # field burning of agricultural residues
+        "3.D": "3.C.2",  # Liming
+        "3.E": "3.C.3",  # urea application
+        "3.F": "3.C.4",  # direct N2O from agri soils
+        "3.G": "3.C.5",  # indirect N2O from agri soils
+        "3.H": "3.C.6",  # indirect N2O from manure management
+        "3.I": "3.C.7",  # rice
         #'4': 'M.LULUCF',
-        '4.A': '3.B.1.a',  # forest remaining forest
-        '4.B': '3.B.2.a',  # cropland remaining cropland
-        '4.C': '3.B.2.b',  # land converted to cropland
-        '4.D': '3.B.6.b',  # land converted to other land
+        "4.A": "3.B.1.a",  # forest remaining forest
+        "4.B": "3.B.2.a",  # cropland remaining cropland
+        "4.C": "3.B.2.b",  # land converted to cropland
+        "4.D": "3.B.6.b",  # land converted to other land
         #'4.E': 'M.3.C.1.LU',  # biomass burning (LULUCF)
-        '4.E.1': '3.C.1.a', # biomass burning (Forest Land)
-        '4.E.2': 'M.3.C.1.b.ii', # biomass burning (Cropland)
-        '4.E.3': '3.C.1.d', # biomass burning (Other Land)
-        '5': '4',
-        '5.A': '4.A',
-        '5.A.1': '4.A.1',
-        '5.A.2': '4.A.2',
-        '5.B': '4.B',
-        '5.C': '4.C',
-        '5.C.1': '4.C.1',
-        '5.D': '4.D',
-        '5.D.1': '4.D.1',
-        '5.D.2': '4.D.2',
-        'M.BK': 'M.BK',
-        'M.BK.A': 'M.BK.A',
-        'M.BK.M': 'M.BM.M',
-        'M.BIO': 'M.BIO',
+        "4.E.1": "3.C.1.a",  # biomass burning (Forest Land)
+        "4.E.2": "M.3.C.1.b.ii",  # biomass burning (Cropland)
+        "4.E.3": "3.C.1.d",  # biomass burning (Other Land)
+        "5": "4",
+        "5.A": "4.A",
+        "5.A.1": "4.A.1",
+        "5.A.2": "4.A.2",
+        "5.B": "4.B",
+        "5.C": "4.C",
+        "5.C.1": "4.C.1",
+        "5.D": "4.D",
+        "5.D.1": "4.D.1",
+        "5.D.2": "4.D.2",
+        "M.BK": "M.BK",
+        "M.BK.A": "M.BK.A",
+        "M.BK.M": "M.BM.M",
+        "M.BIO": "M.BIO",
     },
-    'aggregate': {
-        '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
-        '3.C.1.b': {'sources': ['M.3.C.1.b.i', 'M.3.C.1.b.ii'],
-                  'name': 'Biomass Burning In Cropland'},
-        'M.3.C.1.AG': {'sources': ['3.C.1.b', '3.C.1.c'],
-                  'name': 'Biomass Burning (Agriculture)'},
-        'M.3.C.1.LU': {'sources': ['3.C.1.a', '3.C.1.d'],
-                  'name': 'Biomass Burning (LULUCF)'},
-        '3.C.1': {'sources': ['M.3.C.1.AG', 'M.3.C.1.LU'],
-                  'name': 'Emissions from Biomass Burning'},
-        '3.C': {'sources': ['3.C.1', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-                'name': 'Aggregate sources and non-CO2 emissions sources on land'},
-        'M.3.C.AG': {
-            'sources': ['M.3.C.1.AG', '3.C.2', '3.C.3', '3.C.4', '3.C.5', '3.C.6', '3.C.7'],
-            'name': 'Aggregate sources and non-CO2 emissions sources on land (Agriculture)'},
-        'M.AG.ELV': {'sources': ['M.3.C.AG'],
-                     'name': 'Agriculture excluding livestock emissions'},
-        'M.3.C.LU': {'sources': ['M.3.C.1.LU'],
-                     'name': 'Aggregate sources and non-CO2 emissions sources on land (Land use)'},
-        '3.B.1': {'sources': ['3.B.1.a'], 'name': 'Forest Land'},
-        '3.B.2': {'sources': ['3.B.2.a', '3.B.2.b'], 'name': 'Cropland'},
-        '3.B.6': {'sources': ['3.B.6.b'], 'name': 'Other Land'},
-        '3.B': {'sources': ['3.B.1', '3.B.2', '3.B.6'], 'name': 'Land'},
-        'M.LULUCF': {'sources': ['3.B', 'N.3.C.LU'], 'name': 'LULUCF'},
-        '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
+    "aggregate": {
+        "3.A": {"sources": ["3.A.1", "3.A.2"], "name": "Livestock"},
+        "3.C.1.b": {
+            "sources": ["M.3.C.1.b.i", "M.3.C.1.b.ii"],
+            "name": "Biomass Burning In Cropland",
+        },
+        "M.3.C.1.AG": {
+            "sources": ["3.C.1.b", "3.C.1.c"],
+            "name": "Biomass Burning (Agriculture)",
+        },
+        "M.3.C.1.LU": {
+            "sources": ["3.C.1.a", "3.C.1.d"],
+            "name": "Biomass Burning (LULUCF)",
+        },
+        "3.C.1": {
+            "sources": ["M.3.C.1.AG", "M.3.C.1.LU"],
+            "name": "Emissions from Biomass Burning",
+        },
+        "3.C": {
+            "sources": ["3.C.1", "3.C.2", "3.C.3", "3.C.4", "3.C.5", "3.C.6", "3.C.7"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land",
+        },
+        "M.3.C.AG": {
+            "sources": [
+                "M.3.C.1.AG",
+                "3.C.2",
+                "3.C.3",
+                "3.C.4",
+                "3.C.5",
+                "3.C.6",
+                "3.C.7",
+            ],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Agriculture)",
+        },
+        "M.AG.ELV": {
+            "sources": ["M.3.C.AG"],
+            "name": "Agriculture excluding livestock emissions",
+        },
+        "M.3.C.LU": {
+            "sources": ["M.3.C.1.LU"],
+            "name": "Aggregate sources and non-CO2 emissions sources on land (Land use)",
+        },
+        "3.B.1": {"sources": ["3.B.1.a"], "name": "Forest Land"},
+        "3.B.2": {"sources": ["3.B.2.a", "3.B.2.b"], "name": "Cropland"},
+        "3.B.6": {"sources": ["3.B.6.b"], "name": "Other Land"},
+        "3.B": {"sources": ["3.B.1", "3.B.2", "3.B.6"], "name": "Land"},
+        "M.LULUCF": {"sources": ["3.B", "N.3.C.LU"], "name": "LULUCF"},
+        "3": {"sources": ["M.AG", "M.LULUCF"], "name": "AFOLU"},
     },
 }
 
 sectors_to_save = [
-    '1', '1.A', '1.A.1', '1.A.1.a', '1.A.1.b', '1.A.2', '1.A.3', '1.A.3.a', '1.A.3.b',
-    '1.A.3.c', '1.A.3.d', '1.A.4', '1.A.5',
-    '1.B', '1.B.1', '1.B.2', '1.B.3', '1.C', '1.C.1', '1.C.2', '1.C.3',
-    '2', '2.A', '2.A.1', '2.A.2', '2.A.3', '2.A.4', '2.A.4.b', '2.A.4.d',
-    '2.B', '2.B.2', '2.B.4', '2.B.8', '2.B.8.a', '2.B.8.c', '2.B.8.e', '2.B.8.f',
-    '2.C', '2.C.1', '2.F', '2.F.1', '2.G', '2.G.1', '2.H', '2.H.1', '2.H.2',
-    '3', 'M.AG', '3.A', '3.A.1', '3.A.2',
-    '3.C', '3.C.1', '3.C.1.a', '3.C.1.b', '3.C.1.d', '3.C.2', '3.C.3', '3.C.4',
-    '3.C.5', '3.C.6', '3.C.7', 'M.3.C.1.AG', 'M.3.C.AG', 'M.AG.ELV',
-    'M.LULUCF', 'M.3.C.1.LU', 'M.3.C.LU', '3.B', '3.B.1', '3.B.1.a', '3.B.2', '3.B.2.a',
-    '3.B.2.b', '3.B.6', '3.B.6.b',
-    '4', '4.A', '4.A.1', '4.A.2', '4.B', '4.C', '4.C.1', '4.D', '4.D.1', '4.D.2',
-    '0', 'M.0.EL', 'M.BK', 'M.BK.A', 'M.BK.M', 'M.BIO']
+    "1",
+    "1.A",
+    "1.A.1",
+    "1.A.1.a",
+    "1.A.1.b",
+    "1.A.2",
+    "1.A.3",
+    "1.A.3.a",
+    "1.A.3.b",
+    "1.A.3.c",
+    "1.A.3.d",
+    "1.A.4",
+    "1.A.5",
+    "1.B",
+    "1.B.1",
+    "1.B.2",
+    "1.B.3",
+    "1.C",
+    "1.C.1",
+    "1.C.2",
+    "1.C.3",
+    "2",
+    "2.A",
+    "2.A.1",
+    "2.A.2",
+    "2.A.3",
+    "2.A.4",
+    "2.A.4.b",
+    "2.A.4.d",
+    "2.B",
+    "2.B.2",
+    "2.B.4",
+    "2.B.8",
+    "2.B.8.a",
+    "2.B.8.c",
+    "2.B.8.e",
+    "2.B.8.f",
+    "2.C",
+    "2.C.1",
+    "2.F",
+    "2.F.1",
+    "2.G",
+    "2.G.1",
+    "2.H",
+    "2.H.1",
+    "2.H.2",
+    "3",
+    "M.AG",
+    "3.A",
+    "3.A.1",
+    "3.A.2",
+    "3.C",
+    "3.C.1",
+    "3.C.1.a",
+    "3.C.1.b",
+    "3.C.1.d",
+    "3.C.2",
+    "3.C.3",
+    "3.C.4",
+    "3.C.5",
+    "3.C.6",
+    "3.C.7",
+    "M.3.C.1.AG",
+    "M.3.C.AG",
+    "M.AG.ELV",
+    "M.LULUCF",
+    "M.3.C.1.LU",
+    "M.3.C.LU",
+    "3.B",
+    "3.B.1",
+    "3.B.1.a",
+    "3.B.2",
+    "3.B.2.a",
+    "3.B.2.b",
+    "3.B.6",
+    "3.B.6.b",
+    "4",
+    "4.A",
+    "4.A.1",
+    "4.A.2",
+    "4.B",
+    "4.C",
+    "4.C.1",
+    "4.D",
+    "4.D.1",
+    "4.D.2",
+    "0",
+    "M.0.EL",
+    "M.BK",
+    "M.BK.A",
+    "M.BK.M",
+    "M.BIO",
+]
 
 
 # gas baskets
 gas_baskets = {
-    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
-    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR5GWP100)':['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
-    'FGASES (AR6GWP100)':['HFCS (AR6GWP100)', 'PFCS (AR6GWP100)', 'SF6', 'NF3'],
-    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (SARGWP100)'],
-    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR4GWP100)'],
-    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR5GWP100)'],
-    'KYOTOGHG (AR6GWP100)': ['CO2', 'CH4', 'N2O', 'FGASES (AR6GWP100)'],
+    "FGASES (SARGWP100)": ["HFCS (SARGWP100)", "PFCS (SARGWP100)", "SF6", "NF3"],
+    "FGASES (AR4GWP100)": ["HFCS (AR4GWP100)", "PFCS (AR4GWP100)", "SF6", "NF3"],
+    "FGASES (AR5GWP100)": ["HFCS (AR5GWP100)", "PFCS (AR5GWP100)", "SF6", "NF3"],
+    "FGASES (AR6GWP100)": ["HFCS (AR6GWP100)", "PFCS (AR6GWP100)", "SF6", "NF3"],
+    "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "FGASES (SARGWP100)"],
+    "KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR4GWP100)"],
+    "KYOTOGHG (AR5GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR5GWP100)"],
+    "KYOTOGHG (AR6GWP100)": ["CO2", "CH4", "N2O", "FGASES (AR6GWP100)"],
 }

+ 129 - 89
src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR3_from_pdf.py

@@ -1,10 +1,14 @@
-# this script reads data from Thailand's BUR3
-# Data is read from the pdf file
+"""
+Read Thailand's BUR3 from pdf
 
+This script reads data from Thailand's BUR3
+Data are read from pdf using camelot
+
+"""
 import camelot
 import pandas as pd
 import primap2 as pm2
-from .config_tha_bur3 import (
+from config_tha_bur3 import (
     cat_conversion,
     coords_cols,
     coords_cols_indirect,
@@ -26,53 +30,65 @@ from .config_tha_bur3 import (
     trend_conf,
 )
 
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path, process_data_for_country
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)
 
 if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Thailand' / 'BUR3'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Thailand'
+    input_folder = downloaded_data_path / "UNFCCC" / "Thailand" / "BUR3"
+    output_folder = extracted_data_path / "UNFCCC" / "Thailand"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    inventory_file = 'BUR3_Thailand_251220_.pdf'
-    output_filename = 'THA_BUR3_2020_'
+    inventory_file = "BUR3_Thailand_251220_.pdf"
+    output_filename = "THA_BUR3_2020_"
 
     compression = dict(zlib=True, complevel=9)
 
     # inventory tables
-    pages_inventory = '68,69'
+    pages_inventory = "68,69"
 
     # main sector time series
-    page_main_sector_ts = '70'
+    page_main_sector_ts = "70"
 
     # indirect gases time series
-    page_indirect = '72'
-
+    page_indirect = "72"
 
     # ###
     # read the inventory data and convert to PM2 IF
     # ###
-    tables_inventory = camelot.read_pdf(str(input_folder / inventory_file), pages=pages_inventory,
-                                        split_text=True, flavor="lattice")
+    tables_inventory = camelot.read_pdf(
+        str(input_folder / inventory_file),
+        pages=pages_inventory,
+        split_text=True,
+        flavor="lattice",
+    )
 
     df_inventory = tables_inventory[0].df[1:]
     df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
 
-    df_inventory = pd.concat([df_header, df_inventory, tables_inventory[1].df.iloc[1:]],
-                             axis=0, join='outer')
+    df_inventory = pd.concat(
+        [df_header, df_inventory, tables_inventory[1].df.iloc[1:]], axis=0, join="outer"
+    )
 
-    df_inventory = pm2.pm2io.nir_add_unit_information(df_inventory,
-                                                      unit_row=inv_conf["unit_row"],
-                                                      entity_row=inv_conf["entity_row"],
-                                                      regexp_entity=".*", regexp_unit=".*",
-                                                      default_unit="Gg")
+    df_inventory = pm2.pm2io.nir_add_unit_information(
+        df_inventory,
+        unit_row=inv_conf["unit_row"],
+        entity_row=inv_conf["entity_row"],
+        regexp_entity=".*",
+        regexp_unit=".*",
+        default_unit="Gg",
+    )
     # set index and convert to long format
     df_inventory = df_inventory.set_index(inv_conf["index_cols"])
-    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(df_inventory, inv_conf["year"],
-                                                         inv_conf["header_long"])
+    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(
+        df_inventory, inv_conf["year"], inv_conf["header_long"]
+    )
     df_inventory_long["orig_cat_name"] = df_inventory_long["orig_cat_name"].str[0]
 
     # prep for conversion to PM2 IF and native format
@@ -81,24 +97,29 @@ if __name__ == "__main__":
 
     # replace cat names by codes in col "category"
     # first the manual replacements
-    df_inventory_long["category"] = \
-        df_inventory_long["category"].replace(inv_conf["cat_codes_manual"])
+    df_inventory_long["category"] = df_inventory_long["category"].replace(
+        inv_conf["cat_codes_manual"]
+    )
+
     # then the regex replacements
-    def repl(m):
-        return m.group('code')
-    df_inventory_long["category"] = \
-        df_inventory_long["category"].str.replace(inv_conf["cat_code_regexp"], repl,
-                                                  regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_inventory_long["category"] = df_inventory_long["category"].str.replace(
+        inv_conf["cat_code_regexp"], repl, regex=True
+    )
     df_inventory_long = df_inventory_long.reset_index(drop=True)
 
     # replace "," with "" in data
-    def repl(m):
-        return m.group('part1') + m.group('part2')
-    df_inventory_long.loc[:, "data"] = \
-        df_inventory_long.loc[:, "data"].str.replace(
-            '(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-    df_inventory_long.loc[:, "data"] = df_inventory_long.loc[:, "data"].str.\
-        replace(' ','', regex=False)
+    def repl(m):  # noqa: D103
+        return m.group("part1") + m.group("part2")
+
+    df_inventory_long.loc[:, "data"] = df_inventory_long.loc[:, "data"].str.replace(
+        "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+    )
+    df_inventory_long.loc[:, "data"] = df_inventory_long.loc[:, "data"].str.replace(
+        " ", "", regex=False
+    )
 
     # make sure all col headers are str
     df_inventory_long.columns = df_inventory_long.columns.map(str)
@@ -108,27 +129,31 @@ if __name__ == "__main__":
     data_inventory_IF = pm2.pm2io.convert_long_dataframe_if(
         df_inventory_long,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )
 
     # ###
     # read the main sector time series and convert to PM2 IF
     # ###
-    tables_main_sector_ts = camelot.read_pdf(str(input_folder / inventory_file), pages=page_main_sector_ts,
-                                        split_text=True, flavor="lattice")
+    tables_main_sector_ts = camelot.read_pdf(
+        str(input_folder / inventory_file),
+        pages=page_main_sector_ts,
+        split_text=True,
+        flavor="lattice",
+    )
 
     df_main_sector_ts = tables_main_sector_ts[0].df.iloc[2:]
-    #df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
-    #df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
+    # df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
+    # df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
     df_main_sector_ts.columns = [trend_conf["header"], trend_conf["unit"]]
 
     df_main_sector_ts = df_main_sector_ts.transpose()
@@ -141,42 +166,49 @@ if __name__ == "__main__":
 
     # replace cat names by codes in col "category"
     df_main_sector_ts["category"] = df_main_sector_ts["category"].replace(
-        trend_conf["cat_codes_manual"])
+        trend_conf["cat_codes_manual"]
+    )
 
-    def repl(m):
-        return m.group('part1') + m.group('part2')
-    year_cols = list(set(df_main_sector_ts.columns) - set(['category', 'unit']))
+    def repl(m):  # noqa: D103
+        return m.group("part1") + m.group("part2")
+
+    year_cols = list(set(df_main_sector_ts.columns) - set(["category", "unit"]))
     for col in year_cols:
-        df_main_sector_ts.loc[:, col] = df_main_sector_ts.loc[:, col].str.\
-            replace('(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-        df_main_sector_ts.loc[:, col] = df_main_sector_ts.loc[:, col].str.\
-            replace(' ','', regex=False)
+        df_main_sector_ts.loc[:, col] = df_main_sector_ts.loc[:, col].str.replace(
+            "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+        )
+        df_main_sector_ts.loc[:, col] = df_main_sector_ts.loc[:, col].str.replace(
+            " ", "", regex=False
+        )
 
     data_main_sector_ts_IF = pm2.pm2io.convert_wide_dataframe_if(
         df_main_sector_ts,
         coords_cols=coords_cols_main_sector_ts,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults_main_sector_ts,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
-
+    )
 
     # ###
     # read the indirect gases time series and convert to PM2 IF
     # ###
-    tables_indirect = camelot.read_pdf(str(input_folder / inventory_file), pages=page_indirect,
-                                        split_text=True, flavor="lattice")
+    tables_indirect = camelot.read_pdf(
+        str(input_folder / inventory_file),
+        pages=page_indirect,
+        split_text=True,
+        flavor="lattice",
+    )
 
     df_indirect = tables_indirect[0].df.iloc[2:]
-    #df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
-    #df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
+    # df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
+    # df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
     df_indirect.columns = [ind_conf["header"], ind_conf["unit"]]
 
     df_indirect = df_indirect.transpose()
@@ -188,29 +220,32 @@ if __name__ == "__main__":
     df_indirect = df_indirect.drop(0)
     df_indirect = df_indirect.drop(columns=ind_conf["cols_to_remove"])
 
-    def repl(m):
-        return m.group('part1') + m.group('part2')
-    year_cols = list(set(df_indirect.columns) - set(['entity', 'unit']))
+    def repl(m):  # noqa: D103
+        return m.group("part1") + m.group("part2")
+
+    year_cols = list(set(df_indirect.columns) - set(["entity", "unit"]))
     for col in year_cols:
-        df_indirect.loc[:, col] = df_indirect.loc[:, col].str.\
-            replace('(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$', repl, regex=True)
-        df_indirect.loc[:, col] = df_indirect.loc[:, col].str.\
-            replace(' ','', regex=False)
+        df_indirect.loc[:, col] = df_indirect.loc[:, col].str.replace(
+            "(?P<part1>[0-9]+),(?P<part2>[0-9\\.]+)$", repl, regex=True
+        )
+        df_indirect.loc[:, col] = df_indirect.loc[:, col].str.replace(
+            " ", "", regex=False
+        )
 
     data_indirect_IF = pm2.pm2io.convert_wide_dataframe_if(
         df_indirect,
         coords_cols=coords_cols_indirect,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults_indirect,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )
 
     # ###
     # merge the three datasets
@@ -231,12 +266,15 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_all_if)
+        data_all_if,
+    )
 
     encoding = {var: compression for var in data_all_pm2.data_vars}
     data_all_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
     # ###
     # ## process the data
@@ -244,14 +282,15 @@ if __name__ == "__main__":
     data_proc_pm2 = data_all_pm2
 
     # combine CO2 emissions and removals
-    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
-    data_proc_pm2["CO2"].attrs['entity'] = 'CO2'
+    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum(
+        dim="entity", skipna=True, min_count=1
+    )
+    data_proc_pm2["CO2"].attrs["entity"] = "CO2"
 
     # actual processing
     data_proc_pm2 = process_data_for_country(
         data_proc_pm2,
-        entities_to_ignore=['CO2 emissions', 'CO2 removals'],
+        entities_to_ignore=["CO2 emissions", "CO2 removals"],
         gas_baskets={},
         processing_info_country=country_processing_step1,
     )
@@ -261,16 +300,16 @@ if __name__ == "__main__":
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
         processing_info_country=country_processing_step2,
-        cat_terminology_out = terminology_proc,
-        category_conversion = cat_conversion,
-        sectors_out = sectors_to_save,
+        cat_terminology_out=terminology_proc,
+        category_conversion=cat_conversion,
+        sectors_out=sectors_to_save,
     )
 
     # adapt source and metadata
     # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
     # ###
     # save data to IF and native format
@@ -279,9 +318,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 90 - 64
src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR4_from_pdf.py

@@ -1,18 +1,27 @@
-# this script reads data from Thailand's BUR3
-# Data is read from two csv files which have been created manually from ocr processed
-# pdf files
-# pdftk Thailand_BUR4_final_28122022.pdf cat 65-67east output inventory_2019.pdf
-# ocrmypdf --force-ocr inventory_2019.pdf inventory_2019_ocr.pdf
-# pdftk Thailand_BUR4_final_28122022.pdf cat 69 output trends.pdf
-# ocrmypdf --force-ocr trends.pdf trends_ocr.pdf
-
-# values for HFCs and SF6 have been taken from Table2-9 where they are present in
-# CO2eq and thus HFC data can be used and SF6 data is not 0 as in the mein inventory
-# tables
+"""
+Read Thailand's BUR4 from pdf
+
+This script reads data from Thailand's BUR4
+Data is read from two csv files which have been created manually from ocr processed
+pdf files
+
+.. code-block:: bash
+
+    pdftk Thailand_BUR4_final_28122022.pdf cat 65-67east output inventory_2019.pdf
+    ocrmypdf --force-ocr inventory_2019.pdf inventory_2019_ocr.pdf
+    pdftk Thailand_BUR4_final_28122022.pdf cat 69 output trends.pdf
+    ocrmypdf --force-ocr trends.pdf trends_ocr.pdf
+
+Values for HFCs and SF6 have been taken from Table 2-9 where they are present in
+CO2eq and thus HFC data can be used and SF6 data is not 0 as in the main inventory
+tables
+
+"""
+
 
 import pandas as pd
 import primap2 as pm2
-from .config_tha_bur4 import (
+from config_tha_bur4 import (
     cat_codes_manual_main_sector_ts,
     cat_conversion,
     coords_cols,
@@ -33,36 +42,45 @@ from .config_tha_bur4 import (
     terminology_proc,
 )
 
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path, process_data_for_country
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)
 
 if __name__ == "__main__":
     # ###
     # configuration
     # ###
-    input_folder = downloaded_data_path / 'UNFCCC' / 'Thailand' / 'BUR4'
-    output_folder = extracted_data_path / 'UNFCCC' / 'Thailand'
+    input_folder = downloaded_data_path / "UNFCCC" / "Thailand" / "BUR4"
+    output_folder = extracted_data_path / "UNFCCC" / "Thailand"
     if not output_folder.exists():
         output_folder.mkdir()
 
-    inventory_file = 'THA_inventory_2019.csv'
-    trends_file = 'THA_trends_2000-2019.csv'
-    indirect_file = 'THA_indirect_2000-2019.csv'
-    output_filename = 'THA_BUR4_2022_'
+    inventory_file = "THA_inventory_2019.csv"
+    trends_file = "THA_trends_2000-2019.csv"
+    indirect_file = "THA_indirect_2000-2019.csv"
+    output_filename = "THA_BUR4_2022_"
 
     compression = dict(zlib=True, complevel=9)
 
-
     # ###
     # read the inventory data and convert to PM2 IF
     # ###
-    df_inventory = pd.read_csv(input_folder /inventory_file, header=None)
+    df_inventory = pd.read_csv(input_folder / inventory_file, header=None)
     df_inventory = pm2.pm2io.nir_add_unit_information(
-        df_inventory, unit_row=inv_conf["unit_row"], entity_row=inv_conf["entity_row"],
-        regexp_entity=".*", regexp_unit=".*", default_unit="Gg")
+        df_inventory,
+        unit_row=inv_conf["unit_row"],
+        entity_row=inv_conf["entity_row"],
+        regexp_entity=".*",
+        regexp_unit=".*",
+        default_unit="Gg",
+    )
     # set index and convert to long format
     df_inventory = df_inventory.set_index(inv_conf["index_cols"])
-    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(df_inventory, inv_conf["year"],
-                                                         inv_conf["header_long"])
+    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(
+        df_inventory, inv_conf["year"], inv_conf["header_long"]
+    )
     df_inventory_long["orig_cat_name"] = df_inventory_long["orig_cat_name"].str[0]
 
     # prep for conversion to PM2 IF and native format
@@ -71,14 +89,17 @@ if __name__ == "__main__":
 
     # replace cat names by codes in col "category"
     # first the manual replacements
-    df_inventory_long["category"] = \
-        df_inventory_long["category"].replace(inv_conf["cat_codes_manual"])
+    df_inventory_long["category"] = df_inventory_long["category"].replace(
+        inv_conf["cat_codes_manual"]
+    )
+
     # then the regex replacements
-    def repl(m):
-        return m.group('code')
-    df_inventory_long["category"] = \
-        df_inventory_long["category"].str.replace(inv_conf["cat_code_regexp"], repl,
-                                                  regex=True)
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    df_inventory_long["category"] = df_inventory_long["category"].str.replace(
+        inv_conf["cat_code_regexp"], repl, regex=True
+    )
     df_inventory_long = df_inventory_long.reset_index(drop=True)
 
     # make sure all col headers are str
@@ -89,17 +110,17 @@ if __name__ == "__main__":
     data_inventory_IF = pm2.pm2io.convert_long_dataframe_if(
         df_inventory_long,
         coords_cols=coords_cols,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )
 
     # ###
     # read the main sector time series and convert to PM2 IF
@@ -115,24 +136,24 @@ if __name__ == "__main__":
     df_main_sector_ts = df_main_sector_ts.drop(0)
 
     # replace cat names by codes in col "category"
-    df_main_sector_ts["category"] = \
-        df_main_sector_ts["category"].replace(cat_codes_manual_main_sector_ts)
+    df_main_sector_ts["category"] = df_main_sector_ts["category"].replace(
+        cat_codes_manual_main_sector_ts
+    )
 
     data_main_sector_ts_IF = pm2.pm2io.convert_wide_dataframe_if(
         df_main_sector_ts,
         coords_cols=coords_cols_main_sector_ts,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults_main_sector_ts,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
+        # coords_value_filling=coords_value_filling,
         filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
-        time_format='%Y',
-        )
-
+        time_format="%Y",
+    )
 
     # ###
     # read the indirect gases time series and convert to PM2 IF
@@ -150,17 +171,17 @@ if __name__ == "__main__":
     data_indirect_IF = pm2.pm2io.convert_wide_dataframe_if(
         df_indirect,
         coords_cols=coords_cols_indirect,
-        #add_coords_cols=add_coords_cols,
+        # add_coords_cols=add_coords_cols,
         coords_defaults=coords_defaults_indirect,
         coords_terminologies=coords_terminologies,
         coords_value_mapping=coords_value_mapping,
-        #coords_value_filling=coords_value_filling,
-        #filter_remove=filter_remove,
-        #filter_keep=filter_keep,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
         meta_data=meta_data,
         convert_str=True,
         time_format="%Y",
-        )
+    )
 
     # ###
     # merge the three datasets
@@ -181,12 +202,15 @@ if __name__ == "__main__":
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
         output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-        data_all_if)
+        data_all_if,
+    )
 
     encoding = {var: compression for var in data_all_pm2.data_vars}
     data_all_pm2.pr.to_netcdf(
-        output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-        encoding=encoding)
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
     # ###
     # ## process the data
@@ -194,14 +218,15 @@ if __name__ == "__main__":
     data_proc_pm2 = data_all_pm2
 
     # combine CO2 emissions and removals
-    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum\
-        (dim="entity", skipna=True, min_count=1)
-    data_proc_pm2["CO2"].attrs['entity'] = 'CO2'
+    data_proc_pm2["CO2"] = data_proc_pm2[["CO2 emissions", "CO2 removals"]].pr.sum(
+        dim="entity", skipna=True, min_count=1
+    )
+    data_proc_pm2["CO2"].attrs["entity"] = "CO2"
 
     # actual processing
     data_proc_pm2 = process_data_for_country(
         data_proc_pm2,
-        entities_to_ignore=['CO2 emissions', 'CO2 removals'],
+        entities_to_ignore=["CO2 emissions", "CO2 removals"],
         gas_baskets={},
         processing_info_country=country_processing_step1,
     )
@@ -211,16 +236,16 @@ if __name__ == "__main__":
         entities_to_ignore=[],
         gas_baskets=gas_baskets,
         processing_info_country=country_processing_step2,
-        cat_terminology_out = terminology_proc,
-        category_conversion = cat_conversion,
-        sectors_out = sectors_to_save,
+        cat_terminology_out=terminology_proc,
+        category_conversion=cat_conversion,
+        sectors_out=sectors_to_save,
     )
 
     # adapt source and metadata
     # TODO: processing info is present twice
-    current_source = data_proc_pm2.coords["source"].values[0]
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
     data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-    data_proc_pm2 = data_proc_pm2.pr.set("source", 'BUR_NIR', data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
 
     # ###
     # save data to IF and native format
@@ -229,9 +254,10 @@ if __name__ == "__main__":
     if not output_folder.exists():
         output_folder.mkdir()
     pm2.pm2io.write_interchange_format(
-        output_folder / (output_filename + terminology_proc), data_proc_if)
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
     encoding = {var: compression for var in data_proc_pm2.data_vars}
     data_proc_pm2.pr.to_netcdf(
-        output_folder / (output_filename + terminology_proc + ".nc"),
-        encoding=encoding)
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )

+ 16 - 1
src/unfccc_ghg_data/unfccc_reader/__init__.py

@@ -1 +1,16 @@
-#
+"""Read individual country submissions
+
+The UNFCCC reader contains code to read individual country inventories,
+mostly submitted by non-Annex I countries to the UNFCCC as Biennial Update Reports (
+BUR), National Communications (NC), and National Inventory Reports (NIR). Code to
+read other official country repositories is also included here as it uses the same
+setup.
+
+The code is organized in country folders which contain scripts for each submission
+and configuration files which can also be used for several submissions if the
+configuration is sufficiently similar.
+
+Data are mostly read from pdf files using camelot, but in some cases machine-readable
+files like xlsx are available which we prefer over pdfs.
+
+"""

+ 26 - 19
src/unfccc_ghg_data/unfccc_reader/get_submissions_info.py

@@ -1,19 +1,28 @@
-# helper functions to get information on available submissions
-# and data reading functions for a given country
+"""
+Helper functions for the unfccc_reader
+
+helper functions to get information on available submissions
+and data reading functions for a given country
+"""
 
 import json
 from pathlib import Path
 
-from unfccc_ghg_data.helper import (downloaded_data_path, extracted_data_path,
-                                    get_country_code, root_path)
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    get_country_code,
+    root_path,
+)
 
 code_path = root_path / "src" / "unfccc_ghg_data" / "unfccc_reader"
 # TODO: change this to use the code path stored in the helper module
 
+
 def get_possible_inputs(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
+    country_name: str,
+    submission: str,
+    print_info: bool = False,
 ) -> list[Path]:
     """
     For given country name and submission find the possible input files
@@ -71,10 +80,10 @@ def get_possible_inputs(
 
 
 def get_possible_outputs(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
-)-> list[Path]:
+    country_name: str,
+    submission: str,
+    print_info: bool = False,
+) -> list[Path]:
     """
     For given country name and submission find the possible output files
 
@@ -109,11 +118,15 @@ def get_possible_outputs(
             if country_code in folder_mapping:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping " "json file. Should be str."
+                    )
 
                 output_folder = item / country_folder
                 if output_folder.exists():
-                    for filepath in output_folder.glob(country_code + "_" + submission + "*"):
+                    for filepath in output_folder.glob(
+                        country_code + "_" + submission + "*"
+                    ):
                         output_files.append(filepath.relative_to(root_path))
 
     if print_info:
@@ -125,9 +138,3 @@ def get_possible_outputs(
             print("No output files found")
 
     return output_files
-
-
-
-
-
-

+ 25 - 14
src/unfccc_ghg_data/unfccc_reader/read_UNFCCC_submission.py

@@ -1,28 +1,34 @@
-# this script takes submission and country as input (from doit) and
-# runs the appropriate script to extract the submission data
+"""
+wrapper to read UNFCCC submission
+
+Take submission and country as input (from doit) and
+run the appropriate script to extract the submission data
+
+"""
+
 
 import argparse
 
 import datalad.api
-from .get_submissions_info import (get_possible_inputs, get_possible_outputs)
 
 from unfccc_ghg_data.helper import get_code_file, root_path
 
+from .get_submissions_info import get_possible_inputs, get_possible_outputs
+
 if __name__ == "__main__":
     # Find the right function and possible input and output files and
     # read the data using datalad run.
     parser = argparse.ArgumentParser()
-    parser.add_argument('--country', help='Country name or code')
-    parser.add_argument('--submission', help='Submission to read')
+    parser.add_argument("--country", help="Country name or code")
+    parser.add_argument("--submission", help="Submission to read")
 
     args = parser.parse_args()
 
     country = args.country
     submission = args.submission
 
-
     print(f"Attempting to extract data for {submission} from {country}.")
-    print("#"*80)
+    print("#" * 80)
     print("")
 
     # get the correct script
@@ -35,8 +41,10 @@ if __name__ == "__main__":
         # get possible input files
         input_files = get_possible_inputs(country, submission)
         if not input_files:
-            print(f"No possible input files found for {country}, {submission}. "
-                  f"Something might be wrong here.")
+            print(
+                f"No possible input files found for {country}, {submission}. "
+                f"Something might be wrong here."
+            )
         else:
             print("Found the following input_files:")
             for file in input_files:
@@ -51,8 +59,10 @@ if __name__ == "__main__":
         # get possible output files
         output_files = get_possible_outputs(country, submission)
         if not output_files:
-            print(f"No possible output files found for {country}, {submission}. "
-                  f"This is either the first run or something is wrong.")
+            print(
+                f"No possible output files found for {country}, {submission}. "
+                f"This is either the first run or something is wrong."
+            )
         else:
             print("Found the following output_files:")
             for file in output_files:
@@ -74,6 +84,7 @@ if __name__ == "__main__":
     else:
         # no code found.
         print(f"No code found to read {submission} from {country}")
-        print(f"Use 'doit country_info country={country} to get "
-              f"a list of available submissions and datasets.")
-
+        print(
+            f"Use 'doit country_info country={country}' to get "
+            f"a list of available submissions and datasets."
+        )