
Docs now compile fine

Johannes Gütschow, 10 months ago
parent
commit
c09c78267a
77 changed files with 2661 additions and 2345 deletions
Changed files (+additions −deletions per file):

  1. +4 -3  .copier-answers.yml
  2. +41 -0  .github/ISSUE_TEMPLATE/bug.md
  3. +21 -0  .github/ISSUE_TEMPLATE/default.md
  4. +32 -0  .github/ISSUE_TEMPLATE/feature_request.md
  5. +28 -22  .github/actions/setup/action.yml
  6. +9 -0  .github/pull_request_template.md
  7. +12 -5  .github/workflows/bump.yaml
  8. +11 -0  .github/workflows/ci.yaml
  9. +32 -2  .github/workflows/install.yaml
  10. +1 -2  .github/workflows/release.yaml
  11. +3 -0  .gitignore
  12. +4 -12  .pre-commit-config.yaml
  13. +8 -3  .readthedocs.yaml
  14. +1 -0  LICENCE
  15. +5 -5  Makefile
  16. +6 -7  README.md
  17. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_downloader.download_btr.rst
  18. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_downloader.fetch_submissions_btr.rst
  19. +2 -0  docs/source/api/unfccc_ghg_data.unfccc_downloader.rst
  20. +12 -0  docs/source/api/unfccc_ghg_data.unfccc_downloader.unfccc_submission_info.rst
  21. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Argentina.config_arg_bur5.rst
  22. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Argentina.read_ARG_BUR5_from_csv.rst
  23. +2 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Argentina.rst
  24. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Burundi.config_bdi_bur1.rst
  25. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Burundi.read_BDI_BUR1_from_pdf.rst
  26. +13 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Burundi.rst
  27. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Guinea.config_gin_bur1.rst
  28. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Guinea.read_GIN_BUR1_from_pdf.rst
  29. +13 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Guinea.rst
  30. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Israel.config_isr_bur2.rst
  31. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Mexico.config_mex_bur3.rst
  32. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Mongolia.config_mng_bur2.rst
  33. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Mongolia.read_MNG_BUR2_from_pdf.rst
  34. +13 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Mongolia.rst
  35. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Republic_of_Korea.config_KOR_INV2023.rst
  36. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Republic_of_Korea.read_KOR_2023_Inventory_from_xlsx.rst
  37. +2 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Republic_of_Korea.rst
  38. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Taiwan.config_twn_nir2022.rst
  39. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Taiwan.config_twn_nir2023.rst
  40. +6 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Taiwan.read_TWN_2023_Inventory_from_pdf.rst
  41. +2 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.Taiwan.rst
  42. +3 -0  docs/source/api/unfccc_ghg_data.unfccc_reader.rst
  43. +11 -3  docs/source/conf.py
  44. +31 -3  docs/source/development.md
  45. +0 -2  docs/source/index.md
  46. +0 -23  docs/source/notebooks.md
  47. +0 -24  docs/source/notebooks/basic-demo.py
  48. +95 -159  poetry.lock
  49. +9 -7  pyproject.toml
  50. +131 -125  src/unfccc_ghg_data/unfccc_downloader/download_btr.py
  51. +84 -81  src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_btr.py
  52. +5 -5  src/unfccc_ghg_data/unfccc_downloader/unfccc_submission_info.py
  53. +108 -105  src/unfccc_ghg_data/unfccc_reader/Argentina/read_ARG_BUR5_from_csv.py
  54. +178 -178  src/unfccc_ghg_data/unfccc_reader/Burundi/read_BDI_BUR1_from_pdf.py
  55. +6 -3  src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR4_from_xlsx.py
  56. +6 -3  src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR5_from_xlsx.py
  57. +566 -548  src/unfccc_ghg_data/unfccc_reader/Guinea/read_GIN_BUR1_from_pdf.py
  58. +7 -7  src/unfccc_ghg_data/unfccc_reader/Israel/read_ISR_BUR2_from_pdf.py
  59. +10 -9  src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR3_from_pdf.py
  60. +10 -9  src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR4_from_pdf.py
  61. +5 -3  src/unfccc_ghg_data/unfccc_reader/Mexico/read_MEX_BUR3_from_pdf.py
  62. +273 -270  src/unfccc_ghg_data/unfccc_reader/Mongolia/read_MNG_BUR2_from_pdf.py
  63. +5 -3  src/unfccc_ghg_data/unfccc_reader/Montenegro/read_MNE_BUR3_from_pdf.py
  64. +10 -7  src/unfccc_ghg_data/unfccc_reader/Morocco/read_MAR_BUR3_from_pdf.py
  65. +9 -8  src/unfccc_ghg_data/unfccc_reader/Nigeria/read_NGA_BUR2_from_pdf.py
  66. +11 -10  src/unfccc_ghg_data/unfccc_reader/Peru/read_PER_BUR3_from_pdf.py
  67. +5 -4  src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2021_Inventory_from_xlsx.py
  68. +5 -4  src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2022_Inventory_from_xlsx.py
  69. +0 -384  src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2023-Inventory_from_xlsx.py
  70. +398 -0  src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2023_Inventory_from_xlsx.py
  71. +2 -1  src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py
  72. +17 -9  src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2022_Inventory_from_pdf.py
  73. +0 -258  src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2023-Inventory_from_pdf.py
  74. +266 -0  src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2023_Inventory_from_pdf.py
  75. +8 -11  src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR3_from_pdf.py
  76. +8 -7  src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR4_from_pdf.py
  77. +0 -11  tests/unit/test_operations.py

+ 4 - 3
.copier-answers.yml

@@ -1,8 +1,8 @@
-# Changes here will be overwritten by Copier
-_commit: v0.3.0
+# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
+_commit: v0.6.2
 _src_path: ../../../helper_tools/copier-core-python-repository/
 email: mail@johannes-guetschow.de
-initial_setup: true
+initial_setup: false
 name: Johannes Gütschow
 notebook_dependencies: false
 pandas_doctests: false
@@ -16,3 +16,4 @@ project_name_human: Country greenhouse gas data submitted to the UNFCCC
 project_name_pip: unfccc-ghg-data
 project_name_python: unfccc_ghg_data
 project_url: https://github.com/JGuetschow/UNFCCC_non-AnnexI_data
+track_lock_file: true

+ 41 - 0
.github/ISSUE_TEMPLATE/bug.md

@@ -0,0 +1,41 @@
+---
+name: Bug report
+about: Report a bug
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+## Describe the bug
+<!--- A clear and concise description of what the bug is. -->
+
+## Failing Test
+<!---
+Please put the code (ideally in the form of a unit test) which fails below.
+
+e.g.
+
+```python
+def test_bug_12():
+    # Python code here which fails because of the bug
+    # This is best if other developers can simply copy and paste this test in
+    # order to run it
+```
+-->
+
+## Expected behavior
+<!--- A clear and concise description of what you expected to happen. -->
+
+## Screenshots
+<!--- If applicable, add screenshots to help explain your problem. -->
+
+## System
+<!--- Please complete the following information. -->
+
+ - OS: [e.g. Windows, Linux, macOS]
+ - Python version [e.g. Python 3.11]
+ - Please also upload your `poetry.lock` file (first run `poetry lock` to make sure the lock file is up-to-date)
+
+## Additional context
+<!--- Add any other context about the problem here. -->

+ 21 - 0
.github/ISSUE_TEMPLATE/default.md

@@ -0,0 +1,21 @@
+---
+name: Default
+about: Report an issue or problem
+title: ''
+labels: triage
+assignees: ''
+
+---
+
+## The problem
+<!--- Useful to breakdown to "As a [persona], I [want to do], so that [reason] -->
+
+## Definition of "done"
+<!---
+What are the things that must be true in order to close this issue
+
+We find that describing these as dot points works well.
+-->
+
+## Additional context
+<!--- Add any additional context can go here -->

+ 32 - 0
.github/ISSUE_TEMPLATE/feature_request.md

@@ -0,0 +1,32 @@
+---
+name: Feature Request
+about: Request a feature or suggest an idea for this project
+title: ''
+labels: feature
+assignees: ''
+
+---
+
+## The motivation
+
+<!--- Useful to breakdown to "As a [persona], I [want to do], so that [reason] -->
+
+## The proposed solution
+
+<!---
+If you'd like, please provide a description of the solution you would like to see
+
+If you don't have any ideas for the solution, simply leave this blank
+-->
+
+## Alternatives
+
+<!---
+If you've considered any alternatives, please describe them here
+
+If you don't have any alternatives, simply leave this blank
+-->
+
+## Additional context
+
+<!--- Add any additional context can go here -->

+ 28 - 22
.github/actions/setup/action.yml

@@ -2,6 +2,10 @@ name: "Setup Python and Poetry"
 description: "setup Python and Poetry with caches"
 
 inputs:
+  os:
+    description: "Operating system to use"
+    required: false
+    default: "ubuntu-latest"
   python-version:
     description: "Python version to use"
     required: true
@@ -20,38 +24,40 @@ inputs:
 runs:
   using: "composite"
   steps:
+    - name: Install poetry
+      shell: bash
+      run: |
+        pipx install poetry
+        which poetry
+        poetry --version  # Check poetry installation
+
     - name: Set up Python ${{ inputs.python-version }}
       id: setup-python
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ inputs.python-version }}
-    - name: Load cached Poetry installation
-      id: cached-poetry
-      uses: actions/cache@v3
-      with:
-        path: ~/.local  # the path depends on the OS
-        key: poetry-2  # increment to reset cache
-    - name: Install Poetry
-      if: steps.cached-poetry.outputs.cache-hit != 'true'
-      uses: snok/install-poetry@v1
-    - name: Load cached venv
-      if: ${{ inputs.run-poetry-install == 'true' }}
-      id: cached-poetry-dependencies
-      uses: actions/cache@v3
-      with:
-        path: .venv
-        key: "venv-${{ inputs.venv-id }}-${{ inputs.python-version }}-${{ hashFiles('**/poetry.lock') }}"
-    # Install dependencies first so that our package doesn't appear in the cache by accident
-    - name: Install dependencies
-      if: ${{ inputs.run-poetry-install == 'true' }}
+        cache: poetry
+    - name: Set Poetry environment
       shell: bash
       run: |
+        # This line used to be needed, but seems to have been
+        # sorted with newer poetry versions. We can still check whether
+        # the right version of python is used by looking at the output of
+        # `poetry run which python` below and whether the right version
+        # of python is used in the tests (or whatever step is being done)
+        # poetry env use "python${{ inputs.python-version }}"
         poetry config virtualenvs.create true
         poetry config virtualenvs.in-project true
-        poetry env use ${{ inputs.python-version }}
+    - name: Install dependencies
+      if: ${{ (inputs.run-poetry-install == 'true')  && (steps.setup-python.outputs.cache-hit != 'true') }}
+      shell: bash
+      run: |
         poetry install --no-interaction --no-root ${{ inputs.poetry-dependency-install-flags }}
     # Now run same command but let the package install too
     - name: Install package
+      # To ensure that the package is always installed, this step is run even if the cache was hit
       if: ${{ inputs.run-poetry-install == 'true' }}
       shell: bash
-      run: poetry install --no-interaction ${{ inputs.poetry-dependency-install-flags }}
+      run: |
+        poetry install --no-interaction ${{ inputs.poetry-dependency-install-flags }}
+        poetry run python --version  # Check python version just in case

+ 9 - 0
.github/pull_request_template.md

@@ -0,0 +1,9 @@
+## Description
+
+## Checklist
+
+Please confirm that this pull request has done the following:
+
+- [ ] Tests added
+- [ ] Documentation added (where applicable)
+- [ ] Changelog item added to `changelog/`

+ 12 - 5
.github/workflows/bump.yaml

@@ -47,14 +47,21 @@ jobs:
       - name: Create bump and changelog
 
         run: |
-          BASE_VERSION=`poetry version -s`
-          NEW_VERSION=`poetry version -s ${{ github.event.inputs.bump_rule }}`
-          poetry run towncrier build --yes --version v$NEW_VERSION
-
           git config --global user.name "$GITHUB_ACTOR"
           git config --global user.email "$CI_COMMIT_EMAIL"
 
+          # Bump
+          BASE_VERSION=`poetry version -s`
+          NEW_VERSION=`poetry version -s ${{ github.event.inputs.bump_rule }}`
+          echo "Bumping version $BASE_VERSION > $NEW_VERSION"
+          poetry run towncrier build --yes --version v$NEW_VERSION
           git commit -a -m "bump: version $BASE_VERSION -> $NEW_VERSION"
           git tag v$NEW_VERSION
+
+          # Bump to alpha (so that future commits do not have the same
+          # version as the tagged commit)
+          BASE_VERSION=`poetry version -s`
+          NEW_VERSION=`poetry version -s prerelease`
+          echo "Bumping version $BASE_VERSION > $NEW_VERSION"
+          git commit -a -m "bump(pre-release): version $BASE_VERSION > $NEW_VERSION"
           git push && git push --tags
-          echo "Bumped to version $NEW_VERSION"

+ 11 - 0
.github/workflows/ci.yaml

@@ -15,6 +15,7 @@ jobs:
         uses: actions/checkout@v3
       - uses: ./.github/actions/setup
         with:
+          os: "ubuntu-latest"
           python-version: "3.9"
           venv-id: "docs"
           poetry-dependency-install-flags: "--all-extras --only 'main,dev'"
@@ -29,6 +30,7 @@ jobs:
         uses: actions/checkout@v3
       - uses: ./.github/actions/setup
         with:
+          os: "ubuntu-latest"
           python-version: "3.9"
           venv-id: "docs"
           poetry-dependency-install-flags: "--all-extras --only 'main,docs'"
@@ -42,11 +44,18 @@ jobs:
         os: [ "ubuntu-latest" ]
         python-version: [ "3.9", "3.10", "3.11" ]
     runs-on: "${{ matrix.os }}"
+    defaults:
+      run:
+        # This might be needed for Windows and doesn't seem to affect unix-based systems
+        # so we include it. If you have better proof of whether this is needed or not,
+        # feel free to update.
+        shell: bash
     steps:
       - name: Check out repository
         uses: actions/checkout@v3
       - uses: ./.github/actions/setup
         with:
+          os: "${{ matrix.os }}"
           python-version: "${{ matrix.python-version }}"
           venv-id: "tests-${{ runner.os }}"
           poetry-dependency-install-flags: "--all-extras"
@@ -92,6 +101,8 @@ jobs:
       - name: Check build
         run: |
           tar -tvf dist/unfccc_ghg_data-*.tar.gz --wildcards '*unfccc_ghg_data/py.typed'
+          tar -tvf dist/unfccc_ghg_data-*.tar.gz --wildcards 'unfccc_ghg_data-*/LICENCE'
+
 
   check-dependency-licences:
     runs-on: ubuntu-latest
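
The added `tar -tvf … --wildcards 'unfccc_ghg_data-*/LICENCE'` line extends the build check so that the sdist must ship the new LICENCE file. A rough local equivalent of that check, sketched with Python's standard library only (it assumes a single sdist in `dist/` after `poetry build`):

```python
# Sketch: verify that the sdist produced by `poetry build` contains the
# LICENCE file, mirroring the tar --wildcards check added in ci.yaml.
import fnmatch
import glob
import tarfile

# Assumes exactly one sdist sits in dist/ after `poetry build`.
sdist_path = glob.glob("dist/unfccc_ghg_data-*.tar.gz")[0]

with tarfile.open(sdist_path) as sdist:
    members = sdist.getnames()

if not any(fnmatch.fnmatch(m, "unfccc_ghg_data-*/LICENCE") for m in members):
    raise SystemExit("LICENCE missing from sdist")
print("LICENCE found in sdist")
```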

+ 32 - 2
.github/workflows/install.yaml

@@ -1,12 +1,14 @@
-name: Install
+name: Test installation
 
 on:
+  workflow_dispatch:
   schedule:
     # * is a special character in YAML so you have to quote this string
     - cron:  '0 0 * * 3'
 
 jobs:
   test-pypi-install:
+    name: Test PyPI install (${{ matrix.python-version }}, ${{ matrix.os }})
     runs-on: "${{ matrix.os }}"
     strategy:
       fail-fast: false
@@ -25,6 +27,34 @@ jobs:
         pip install unfccc-ghg-data
     - name: Checkout repository
       uses: actions/checkout@v3
-    - name: Test installation (${{ matrix.os }} Python ${{ matrix.python-version }})
+    - name: Test installation
       run: |
+        which python
+        python scripts/test-install.py
+
+  test-micromamba-installation:
+    name: Test (micro)mamba install (${{ matrix.python-version }}, ${{ matrix.os }})
+    runs-on: "${{ matrix.os }}"
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ["ubuntu-latest", "macos-latest", "windows-latest"]
+        python-version: [ "3.9", "3.10", "3.11" ]
+
+    steps:
+    - name: Setup (micro)mamba and install package
+      uses: mamba-org/setup-micromamba@v1
+      with:
+        environment-name: test-mamba-install
+        create-args: >-
+          python=${{ matrix.python-version }}
+          -c conda-forge
+          unfccc-ghg-data
+        init-shell: bash
+    - name: Checkout repository
+      uses: actions/checkout@v3
+    - name: Test installation
+      shell: bash -leo pipefail {0}
+      run: |
+        which python
         python scripts/test-install.py
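
Both install jobs finish by running `python scripts/test-install.py`. That script is not part of this diff, so the sketch below only illustrates the kind of smoke test it is assumed to perform: import the freshly installed package and fail loudly if anything is broken.

```python
# Hypothetical sketch of a post-install smoke test; the real
# scripts/test-install.py may check more than this.
import importlib

import unfccc_ghg_data

print(f"unfccc_ghg_data version: {unfccc_ghg_data.__version__}")

# Confirm the main sub-packages import cleanly in the fresh environment.
for mod in ("unfccc_ghg_data.unfccc_downloader", "unfccc_ghg_data.unfccc_reader"):
    importlib.import_module(mod)
    print(f"imported {mod}")
```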

+ 1 - 2
.github/workflows/release.yaml

@@ -39,9 +39,8 @@ jobs:
         uses: softprops/action-gh-release@v1
         with:
           body_path: ".github/release_template.md"
+          token: "${{ secrets.PERSONAL_ACCESS_TOKEN }}"
           draft: true
           files: |
             dist/unfccc_ghg_data-${{ env.PROJECT_VERSION }}-py3-none-any.whl
             dist/unfccc_ghg_data-${{ env.PROJECT_VERSION }}.tar.gz
-        env:
-          GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"

+ 3 - 0
.gitignore

@@ -17,6 +17,9 @@ geckodriver.log
 # Jupyter cache
 .jupyter_cache
 
+# Ruff cache
+.ruff_cache
+
 # Licence check
 licence-check.txt
 

+ 4 - 12
.pre-commit-config.yaml

@@ -7,7 +7,7 @@ ci:
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: 'v4.4.0'
+    rev: 'v4.5.0'
     hooks:
       - id: check-added-large-files
       - id: check-ast
@@ -31,20 +31,12 @@ repos:
         language: fail
         files: "\\.rej$"
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: 'v0.0.263'
+    rev: 'v0.1.8'
     hooks:
       - id: ruff
         args: [ --fix, --exit-non-zero-on-fix ]
-  - repo: https://github.com/psf/black
-    rev: '23.3.0'
-    hooks:
-      - id: black
-  # additional to the above, apply black to doctests in source code
-  - repo: https://github.com/keewis/blackdoc
-    rev: v0.3.8
-    hooks:
-      - id: blackdoc
+      - id: ruff-format
   - repo: https://github.com/python-poetry/poetry
-    rev: '1.4.2'
+    rev: '1.7.0'
     hooks:
       - id: poetry-check

+ 8 - 3
.readthedocs.yaml

@@ -11,14 +11,19 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.9"
+    python: "3.11"
   jobs:
     post_create_environment:
       - pip install poetry
       - poetry config virtualenvs.create false
     post_install:
-      - poetry install --with docs --all-extras
+      # RtD seems to be not happy with poetry installs,
+      # hence use pip directly instead.
+      - poetry export -f requirements.txt --output requirements.txt --with docs
+      - python -m pip install -r requirements.txt
+      - python -m pip install .
+      - python -m pip list
 
-# Build documentation in the docs/ directory with Sphinx
+# Set sphinx configuration
 sphinx:
    configuration: docs/source/conf.py

+ 1 - 0
LICENCE

@@ -0,0 +1 @@
+To be decided by project implementer

+ 5 - 5
Makefile

@@ -18,6 +18,7 @@ endef
 export PRINT_HELP_PYSCRIPT
 
 
+.PHONY: help
 help:  ## print short description of each target
 	@python3 -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
 
@@ -31,14 +32,13 @@ checks:  ## run all the linting checks of the codebase
 		echo "=== mypy ==="; MYPYPATH=stubs poetry run mypy src || echo "--- mypy failed ---" >&2; \
 		echo "======"
 
-.PHONY: black
-black:  ## format the code using black
-	poetry run black src tests docs/source/conf.py scripts docs/source/notebooks/*.py
-	poetry run blackdoc src
-
 .PHONY: ruff-fixes
 ruff-fixes:  ## fix the code using ruff
+    # format before and after checking so that the formatted stuff is checked and
+    # the fixed stuff is formatted
+	poetry run ruff format src tests scripts docs/source/conf.py docs/source/notebooks/*.py
 	poetry run ruff src tests scripts docs/source/conf.py docs/source/notebooks/*.py --fix
+	poetry run ruff format src tests scripts docs/source/conf.py docs/source/notebooks/*.py
 
 .PHONY: ruff-fixes-current
 ruff-fixes-current:  ## fix the code using ruff

+ 6 - 7
README.md

@@ -8,7 +8,7 @@ Reading country greenhouse gas data submitted to the United Nations Framework Co
 
 
 [![CI](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/actions/workflows/ci.yaml/badge.svg?branch=main)](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/actions/workflows/ci.yaml)
-[![Coverage](https://codecov.io/gh/climate-resource/UNFCCC_non-AnnexI_data/branch/main/graph/badge.svg)](https://codecov.io/gh/climate-resource/UNFCCC_non-AnnexI_data)
+[![Coverage](https://codecov.io/gh/JGuetschow/UNFCCC_non-AnnexI_data/branch/main/graph/badge.svg)](https://codecov.io/gh/JGuetschow/UNFCCC_non-AnnexI_data)
 [![Docs](https://readthedocs.org/projects/unfccc-ghg-data/badge/?version=latest)](https://unfccc-ghg-data.readthedocs.io)
 
 **PyPI :**
@@ -17,7 +17,7 @@ Reading country greenhouse gas data submitted to the United Nations Framework Co
 [![PyPI install](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/actions/workflows/install.yaml/badge.svg?branch=main)](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/actions/workflows/install.yaml)
 
 **Other info :**
-[![License](https://img.shields.io/github/license/JGuetschow/UNFCCC_non-AnnexI_data.svg)](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/blob/main/LICENSE)
+[![Licence](https://img.shields.io/github/license/JGuetschow/UNFCCC_non-AnnexI_data.svg)](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/blob/main/LICENCE)
 [![Last Commit](https://img.shields.io/github/last-commit/JGuetschow/UNFCCC_non-AnnexI_data.svg)](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/commits/main)
 [![Contributors](https://img.shields.io/github/contributors/JGuetschow/UNFCCC_non-AnnexI_data.svg)](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/graphs/contributors)
 
@@ -33,10 +33,11 @@ don't render correctly on GitHub's viewer.
 
 <!--- sec-begin-installation -->
 
-Country greenhouse gas data submitted to the UNFCCC can be installed with conda or pip:
+Country greenhouse gas data submitted to the UNFCCC can be installed with pip, mamba or conda:
 
 ```bash
 pip install unfccc-ghg-data
+mamba install -c conda-forge unfccc-ghg-data
 conda install -c conda-forge unfccc-ghg-data
 ```
 
@@ -69,12 +70,11 @@ but we generally discourage this because it can be error prone.
 In order to create your environment, run `make virtual-environment`.
 
 If there are any issues, the messages from the `Makefile` should guide you
-through. If not, please raise an issue in the [issue tracker][issue_tracker].
+through. If not, please raise an issue in the
+[issue tracker](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/issues).
 
 For the rest of our developer docs, please see [](development-reference).
 
-[issue_tracker]: https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/issues
-
 <!--- sec-end-installation-dev -->
 
 
@@ -309,4 +309,3 @@ files, you can find out what has been done to the dataset or to individual
 files by whom, and when.
 
 <!--- sec-end-datalad -->
-

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_downloader.download_btr.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_downloader.download\_btr
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_downloader.download_btr
+
+.. currentmodule:: unfccc_ghg_data.unfccc_downloader.download_btr

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_downloader.fetch_submissions_btr.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_downloader.fetch\_submissions\_btr
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_downloader.fetch_submissions_btr
+
+.. currentmodule:: unfccc_ghg_data.unfccc_downloader.fetch_submissions_btr

+ 2 - 0
docs/source/api/unfccc_ghg_data.unfccc_downloader.rst

@@ -10,9 +10,11 @@ unfccc\_ghg\_data.unfccc\_downloader
   :toctree: ./
 
   unfccc_ghg_data.unfccc_downloader.download_annexI
+  unfccc_ghg_data.unfccc_downloader.download_btr
   unfccc_ghg_data.unfccc_downloader.download_ndc
   unfccc_ghg_data.unfccc_downloader.download_nonannexI
   unfccc_ghg_data.unfccc_downloader.fetch_submissions_annexI
+  unfccc_ghg_data.unfccc_downloader.fetch_submissions_btr
   unfccc_ghg_data.unfccc_downloader.fetch_submissions_bur
   unfccc_ghg_data.unfccc_downloader.fetch_submissions_nc
   unfccc_ghg_data.unfccc_downloader.unfccc_submission_info

+ 12 - 0
docs/source/api/unfccc_ghg_data.unfccc_downloader.unfccc_submission_info.rst

@@ -5,3 +5,15 @@ unfccc\_ghg\_data.unfccc\_downloader.unfccc\_submission\_info
 
 .. currentmodule:: unfccc_ghg_data.unfccc_downloader.unfccc_submission_info
 
+
+
+get\_unfccc\_submission\_info
+=============================
+
+.. autofunction:: get_unfccc_submission_info
+
+
+get\_BTR\_name\_and\_URL
+========================
+
+.. autofunction:: get_BTR_name_and_URL
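
The stub now documents `get_BTR_name_and_URL` alongside `get_unfccc_submission_info`; the helper is imported by `download_btr.py` further down in this commit. Its signature is not visible in this diff, so the usage below is only an assumption about its role (mapping a BTR submission round to a round name and listing URL):

```python
# Assumed usage only: the real signature of get_BTR_name_and_URL is not shown
# in this diff; the call merely illustrates the helper's apparent purpose.
from unfccc_ghg_data.unfccc_downloader import get_BTR_name_and_URL

round_name, round_url = get_BTR_name_and_URL(1)  # hypothetical argument
print(round_name, round_url)
```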

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Argentina.config_arg_bur5.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Argentina.config\_arg\_bur5
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Argentina.config_arg_bur5
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Argentina.config_arg_bur5

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Argentina.read_ARG_BUR5_from_csv.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Argentina.read\_ARG\_BUR5\_from\_csv
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Argentina.read_ARG_BUR5_from_csv
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Argentina.read_ARG_BUR5_from_csv

+ 2 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Argentina.rst

@@ -9,4 +9,6 @@ unfccc\_ghg\_data.unfccc\_reader.Argentina
 .. autosummary::
   :toctree: ./
 
+  unfccc_ghg_data.unfccc_reader.Argentina.config_arg_bur5
   unfccc_ghg_data.unfccc_reader.Argentina.read_ARG_BUR4_from_pdf
+  unfccc_ghg_data.unfccc_reader.Argentina.read_ARG_BUR5_from_csv

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Burundi.config_bdi_bur1.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Burundi.config\_bdi\_bur1
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Burundi.config_bdi_bur1
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Burundi.config_bdi_bur1

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Burundi.read_BDI_BUR1_from_pdf.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Burundi.read\_BDI\_BUR1\_from\_pdf
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Burundi.read_BDI_BUR1_from_pdf
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Burundi.read_BDI_BUR1_from_pdf

+ 13 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Burundi.rst

@@ -0,0 +1,13 @@
+unfccc\_ghg\_data.unfccc\_reader.Burundi
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Burundi
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Burundi
+
+
+.. autosummary::
+  :toctree: ./
+
+  unfccc_ghg_data.unfccc_reader.Burundi.config_bdi_bur1
+  unfccc_ghg_data.unfccc_reader.Burundi.read_BDI_BUR1_from_pdf

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Guinea.config_gin_bur1.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Guinea.config\_gin\_bur1
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Guinea.config_gin_bur1
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Guinea.config_gin_bur1

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Guinea.read_GIN_BUR1_from_pdf.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Guinea.read\_GIN\_BUR1\_from\_pdf
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Guinea.read_GIN_BUR1_from_pdf
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Guinea.read_GIN_BUR1_from_pdf

+ 13 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Guinea.rst

@@ -0,0 +1,13 @@
+unfccc\_ghg\_data.unfccc\_reader.Guinea
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Guinea
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Guinea
+
+
+.. autosummary::
+  :toctree: ./
+
+  unfccc_ghg_data.unfccc_reader.Guinea.config_gin_bur1
+  unfccc_ghg_data.unfccc_reader.Guinea.read_GIN_BUR1_from_pdf

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Israel.config_isr_bur2.rst

@@ -5,3 +5,9 @@ unfccc\_ghg\_data.unfccc\_reader.Israel.config\_isr\_bur2
 
 .. currentmodule:: unfccc_ghg_data.unfccc_reader.Israel.config_isr_bur2
 
+
+
+is\_int
+=======
+
+.. autofunction:: is_int

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Mexico.config_mex_bur3.rst

@@ -5,3 +5,9 @@ unfccc\_ghg\_data.unfccc\_reader.Mexico.config\_mex\_bur3
 
 .. currentmodule:: unfccc_ghg_data.unfccc_reader.Mexico.config_mex_bur3
 
+
+
+fix\_rows
+=========
+
+.. autofunction:: fix_rows

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Mongolia.config_mng_bur2.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Mongolia.config\_mng\_bur2
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Mongolia.config_mng_bur2
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Mongolia.config_mng_bur2

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Mongolia.read_MNG_BUR2_from_pdf.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Mongolia.read\_MNG\_BUR2\_from\_pdf
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Mongolia.read_MNG_BUR2_from_pdf
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Mongolia.read_MNG_BUR2_from_pdf

+ 13 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Mongolia.rst

@@ -0,0 +1,13 @@
+unfccc\_ghg\_data.unfccc\_reader.Mongolia
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Mongolia
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Mongolia
+
+
+.. autosummary::
+  :toctree: ./
+
+  unfccc_ghg_data.unfccc_reader.Mongolia.config_mng_bur2
+  unfccc_ghg_data.unfccc_reader.Mongolia.read_MNG_BUR2_from_pdf

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Republic_of_Korea.config_KOR_INV2023.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Republic\_of\_Korea.config\_KOR\_INV2023
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Republic_of_Korea.config_KOR_INV2023
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Republic_of_Korea.config_KOR_INV2023

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Republic_of_Korea.read_KOR_2023_Inventory_from_xlsx.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Republic\_of\_Korea.read\_KOR\_2023\_Inventory\_from\_xlsx
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Republic_of_Korea.read_KOR_2023_Inventory_from_xlsx
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Republic_of_Korea.read_KOR_2023_Inventory_from_xlsx

+ 2 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Republic_of_Korea.rst

@@ -9,7 +9,9 @@ unfccc\_ghg\_data.unfccc\_reader.Republic\_of\_Korea
 .. autosummary::
   :toctree: ./
 
+  unfccc_ghg_data.unfccc_reader.Republic_of_Korea.config_KOR_INV2023
   unfccc_ghg_data.unfccc_reader.Republic_of_Korea.config_kor_bur4
   unfccc_ghg_data.unfccc_reader.Republic_of_Korea.read_KOR_2021_Inventory_from_xlsx
   unfccc_ghg_data.unfccc_reader.Republic_of_Korea.read_KOR_2022_Inventory_from_xlsx
+  unfccc_ghg_data.unfccc_reader.Republic_of_Korea.read_KOR_2023_Inventory_from_xlsx
   unfccc_ghg_data.unfccc_reader.Republic_of_Korea.read_KOR_BUR4_from_xlsx

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Taiwan.config_twn_nir2022.rst

@@ -5,3 +5,9 @@ unfccc\_ghg\_data.unfccc\_reader.Taiwan.config\_twn\_nir2022
 
 .. currentmodule:: unfccc_ghg_data.unfccc_reader.Taiwan.config_twn_nir2022
 
+
+
+fix\_rows
+=========
+
+.. autofunction:: fix_rows

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Taiwan.config_twn_nir2023.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Taiwan.config\_twn\_nir2023
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Taiwan.config_twn_nir2023
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Taiwan.config_twn_nir2023

+ 6 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Taiwan.read_TWN_2023_Inventory_from_pdf.rst

@@ -0,0 +1,6 @@
+unfccc\_ghg\_data.unfccc\_reader.Taiwan.read\_TWN\_2023\_Inventory\_from\_pdf
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: unfccc_ghg_data.unfccc_reader.Taiwan.read_TWN_2023_Inventory_from_pdf
+
+.. currentmodule:: unfccc_ghg_data.unfccc_reader.Taiwan.read_TWN_2023_Inventory_from_pdf

+ 2 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.Taiwan.rst

@@ -10,4 +10,6 @@ unfccc\_ghg\_data.unfccc\_reader.Taiwan
   :toctree: ./
 
   unfccc_ghg_data.unfccc_reader.Taiwan.config_twn_nir2022
+  unfccc_ghg_data.unfccc_reader.Taiwan.config_twn_nir2023
   unfccc_ghg_data.unfccc_reader.Taiwan.read_TWN_2022_Inventory_from_pdf
+  unfccc_ghg_data.unfccc_reader.Taiwan.read_TWN_2023_Inventory_from_pdf

+ 3 - 0
docs/source/api/unfccc_ghg_data.unfccc_reader.rst

@@ -10,12 +10,15 @@ unfccc\_ghg\_data.unfccc\_reader
   :toctree: ./
 
   unfccc_ghg_data.unfccc_reader.Argentina
+  unfccc_ghg_data.unfccc_reader.Burundi
   unfccc_ghg_data.unfccc_reader.Chile
   unfccc_ghg_data.unfccc_reader.Colombia
+  unfccc_ghg_data.unfccc_reader.Guinea
   unfccc_ghg_data.unfccc_reader.Indonesia
   unfccc_ghg_data.unfccc_reader.Israel
   unfccc_ghg_data.unfccc_reader.Malaysia
   unfccc_ghg_data.unfccc_reader.Mexico
+  unfccc_ghg_data.unfccc_reader.Mongolia
   unfccc_ghg_data.unfccc_reader.Montenegro
   unfccc_ghg_data.unfccc_reader.Morocco
   unfccc_ghg_data.unfccc_reader.Nigeria

+ 11 - 3
docs/source/conf.py

@@ -10,10 +10,10 @@ from pathlib import Path
 
 from sphinxcontrib_autodocgen import AutoDocGen
 
-os.environ["UNFCCC_GHG_ROOT_PATH"] = str(Path("..") / "..")
-
 import unfccc_ghg_data
 
+os.environ["UNFCCC_GHG_ROOT_PATH"] = str(Path("..") / "..")
+
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
@@ -152,8 +152,16 @@ exec_code_example_dir = "."
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
 # Pick your theme for html output, we typically use the read the docs theme
-html_theme = "sphinx_rtd_theme"
+html_theme = "sphinx_book_theme"
 html_static_path = ["_static"]
+html_theme_options = {
+    "repository_url": "https://github.com/JGuetschow/UNFCCC_non-AnnexI_data",
+    "repository_branch": "main",
+    "path_to_docs": "docs/source",
+    "use_repository_button": True,
+    "use_issues_button": True,
+    "use_edit_page_button": True,
+}
 
 
 # Ignore ipynb files when building (see https://github.com/executablebooks/MyST-NB/issues/363).

+ 31 - 3
docs/source/development.md

@@ -2,6 +2,34 @@
 # Development
 
 Notes for developers. If you want to get involved, please do!
+We welcome all kinds of contributions, for example:
+
+- docs fixes/clarifications
+- bug reports
+- bug fixes
+- feature requests
+- pull requests
+- tutorials
+
+## Workflows
+
+<!---
+This section is auto-generated by the copier template
+and the text below is just a placeholder to get you started.
+The workflows section will likely need to be updated
+to be project specific as the project's norms are established.
+-->
+
+We don't mind whether you use a branching or forking workflow.
+However, please only push to your own branches,
+pushing to other people's branches is often a recipe for disaster,
+is never required in our experience
+so is best avoided.
+
+Try and keep your merge requests as small as possible
+(focus on one thing if you can).
+This makes life much easier for reviewers
+which allows contributions to be accepted at a faster rate.
 
 ## Language
 
@@ -28,13 +56,13 @@ The steps required are the following:
 
 
 1. Bump the version: manually trigger the "bump" workflow from the main branch
-   (see here: https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/actions/workflows/bump.yaml).
-   A valid "bump_rule" (see https://python-poetry.org/docs/cli/#version) will need to be specified.
+   (see here: [bump workflow](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/actions/workflows/bump.yaml)).
+   A valid "bump_rule" (see [poetry's docs](https://python-poetry.org/docs/cli/#version)) will need to be specified.
    This will then trigger a draft release.
 
 1. Edit the draft release which has been created
    (see here:
-   https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/releases).
+   [project releases](https://github.com/JGuetschow/UNFCCC_non-AnnexI_data/releases)).
    Once you are happy with the release (removed placeholders, added key
    announcements etc.) then hit 'Publish release'. This triggers a release to
    PyPI (which you can then add to the release if you want).

+ 0 - 2
docs/source/index.md

@@ -29,13 +29,11 @@
 :caption: Contents
 :maxdepth: 2
 usage
-notebooks
 development
 api/unfccc_ghg_data
 changelog
 ```
 
-
 Index
 -----
 

+ 0 - 23
docs/source/notebooks.md

@@ -1,23 +0,0 @@
-(notebooks-reference)=
-# Notebooks
-
-TODO
-
-Here we provide various examples of how to use Country greenhouse gas data submitted to the UNFCCC.
-They  are derived from
-[jupyter notebooks](https://docs.jupyter.org/en/latest/start/index.html),
-but are saved using [jupytext](https://jupytext.readthedocs.io/en/latest/)
-to keep our repository slim and make it easier to track changes.
-
-## Basic demos
-
-```{toctree}
-:caption: Contents
-:maxdepth: 1
-notebooks/basic-demo.py
-```
-
-## Notebook execution info
-
-```{nb-exec-table}
-```

+ 0 - 24
docs/source/notebooks/basic-demo.py

@@ -1,24 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.14.5
-#   kernelspec:
-#     display_name: Python 3 (ipykernel)
-#     language: python
-#     name: python3
-# ---
-
-# %% [markdown]
-# # Basic demo
-#
-# This notebook gives a basic demonstration of how to use Country greenhouse gas data submitted to the UNFCCC.
-
-# %%
-import unfccc_ghg_data
-
-# %%
-print(f"You are using unfccc_ghg_data version {unfccc_ghg_data.__version__}")

+ 95 - 159
poetry.lock

@@ -1,5 +1,23 @@
 # This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 
+[[package]]
+name = "accessible-pygments"
+version = "0.0.5"
+description = "A collection of accessible pygments styles"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "accessible_pygments-0.0.5-py3-none-any.whl", hash = "sha256:88ae3211e68a1d0b011504b2ffc1691feafce124b845bd072ab6f9f66f34d4b7"},
+    {file = "accessible_pygments-0.0.5.tar.gz", hash = "sha256:40918d3e6a2b619ad424cb91e556bd3bd8865443d9f22f1dcdf79e33c8046872"},
+]
+
+[package.dependencies]
+pygments = ">=1.5"
+
+[package.extras]
+dev = ["pillow", "pkginfo (>=1.10)", "playwright", "pre-commit", "setuptools", "twine (>=5.0)"]
+tests = ["hypothesis", "pytest"]
+
 [[package]]
 name = "alabaster"
 version = "0.7.16"
@@ -124,73 +142,6 @@ charset-normalizer = ["charset-normalizer"]
 html5lib = ["html5lib"]
 lxml = ["lxml"]
 
-[[package]]
-name = "black"
-version = "23.3.0"
-description = "The uncompromising code formatter."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
-    {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
-    {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
-    {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
-    {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
-    {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
-    {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
-    {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
-    {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
-    {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
-    {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
-    {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
-    {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
-    {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
-    {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
-    {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
-    {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
-    {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
-    {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
-    {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
-    {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
-    {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
-    {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
-    {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
-    {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
-]
-
-[package.dependencies]
-click = ">=8.0.0"
-mypy-extensions = ">=0.4.3"
-packaging = ">=22.0"
-pathspec = ">=0.9.0"
-platformdirs = ">=2"
-tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
-typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
-
-[package.extras]
-colorama = ["colorama (>=0.4.3)"]
-d = ["aiohttp (>=3.7.4)"]
-jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
-uvloop = ["uvloop (>=0.15.2)"]
-
-[[package]]
-name = "blackdoc"
-version = "0.3.8"
-description = "run black on documentation code snippets"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "blackdoc-0.3.8-py3-none-any.whl", hash = "sha256:c003a1b72d57692b343815c8b7a15e78977caba96c86060def892602b5aba083"},
-    {file = "blackdoc-0.3.8.tar.gz", hash = "sha256:3c9d5534f92557a627a31550c7faec8363b5b0929bbb0ca3f5df179a81a9d6b2"},
-]
-
-[package.dependencies]
-black = "*"
-more-itertools = "*"
-pathspec = "*"
-rich = "*"
-tomli = "*"
-
 [[package]]
 name = "boto"
 version = "2.49.0"
@@ -890,13 +841,13 @@ files = [
 
 [[package]]
 name = "docutils"
-version = "0.18.1"
+version = "0.19"
 description = "Docutils -- Python Documentation Utilities"
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+python-versions = ">=3.7"
 files = [
-    {file = "docutils-0.18.1-py2.py3-none-any.whl", hash = "sha256:23010f129180089fbcd3bc08cfefccb3b890b0050e1ca00c867036e9d161b98c"},
-    {file = "docutils-0.18.1.tar.gz", hash = "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06"},
+    {file = "docutils-0.19-py3-none-any.whl", hash = "sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc"},
+    {file = "docutils-0.19.tar.gz", hash = "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6"},
 ]
 
 [[package]]
@@ -2585,17 +2536,6 @@ toolz = "*"
 [package.extras]
 complete = ["blosc", "numpy (>=1.20.0)", "pandas (>=1.3)", "pyzmq"]
 
-[[package]]
-name = "pathspec"
-version = "0.12.1"
-description = "Utility library for gitignore style pattern matching of file paths."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"},
-    {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"},
-]
-
 [[package]]
 name = "patool"
 version = "1.15.0"
@@ -3011,6 +2951,33 @@ files = [
     {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
 ]
 
+[[package]]
+name = "pydata-sphinx-theme"
+version = "0.15.2"
+description = "Bootstrap-based Sphinx theme from the PyData community"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "pydata_sphinx_theme-0.15.2-py3-none-any.whl", hash = "sha256:0c5fa1fa98a9b26dae590666ff576f27e26c7ba708fee754ecb9e07359ed4588"},
+    {file = "pydata_sphinx_theme-0.15.2.tar.gz", hash = "sha256:4243fee85b3afcfae9df64f83210a04e7182e53bc3db8841ffff6d21d95ae320"},
+]
+
+[package.dependencies]
+accessible-pygments = "*"
+Babel = "*"
+beautifulsoup4 = "*"
+docutils = "!=0.17.0"
+packaging = "*"
+pygments = ">=2.7"
+sphinx = ">=5.0"
+typing-extensions = "*"
+
+[package.extras]
+a11y = ["pytest-playwright"]
+dev = ["nox", "pre-commit", "pydata-sphinx-theme[doc,test]", "pyyaml"]
+doc = ["ablog (>=0.11.0rc2)", "colorama", "ipykernel", "ipyleaflet", "jupyter_sphinx", "jupyterlite-sphinx", "linkify-it-py", "matplotlib", "myst-parser", "nbsphinx", "numpy", "numpydoc", "pandas", "plotly", "rich", "sphinx-autoapi (>=3.0.0)", "sphinx-copybutton", "sphinx-design", "sphinx-favicon (>=1.0.1)", "sphinx-sitemap", "sphinx-togglebutton", "sphinxcontrib-youtube (<1.4)", "sphinxext-rediraffe", "xarray"]
+test = ["pytest", "pytest-cov", "pytest-regressions"]
+
 [[package]]
 name = "pygments"
 version = "2.18.0"
@@ -3128,13 +3095,13 @@ six = ">=1.5"
 
 [[package]]
 name = "python-gitlab"
-version = "4.4.0"
+version = "4.5.0"
 description = "A python wrapper for the GitLab API"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "python-gitlab-4.4.0.tar.gz", hash = "sha256:1d117bf7b433ae8255e5d74e72c660978f50ee85eb62248c9fb52ef43c3e3814"},
-    {file = "python_gitlab-4.4.0-py3-none-any.whl", hash = "sha256:cdad39d016f59664cdaad0f878f194c79cb4357630776caa9a92c1da25c8d986"},
+    {file = "python_gitlab-4.5.0-py3-none-any.whl", hash = "sha256:b078b63afab7624ef2084aac64e3a9f4488f55b2234017e05df1b7260169cb52"},
+    {file = "python_gitlab-4.5.0.tar.gz", hash = "sha256:0a106174949819912b9abb4232e39059f83f613177fdb1787097eb84481c64b2"},
 ]
 
 [package.dependencies]
@@ -3399,24 +3366,6 @@ files = [
 [package.dependencies]
 requests = ">=2.0.1,<3.0.0"
 
-[[package]]
-name = "rich"
-version = "13.7.1"
-description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
-optional = false
-python-versions = ">=3.7.0"
-files = [
-    {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"},
-    {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"},
-]
-
-[package.dependencies]
-markdown-it-py = ">=2.2.0"
-pygments = ">=2.13.0,<3.0.0"
-
-[package.extras]
-jupyter = ["ipywidgets (>=7.5.1,<9)"]
-
 [[package]]
 name = "rpds-py"
 version = "0.18.1"
@@ -3604,28 +3553,28 @@ files = [
 
 [[package]]
 name = "ruff"
-version = "0.0.264"
-description = "An extremely fast Python linter, written in Rust."
+version = "0.1.15"
+description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.0.264-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:ec2fa192c035b8b68cc2b91049c561cd69543e2b8c4d157d9aa7727320bedcca"},
-    {file = "ruff-0.0.264-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:d97ba8db0fb601ffe9ee996ebb97c698e427a2fd4514fefbe7b803111354f783"},
-    {file = "ruff-0.0.264-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4564e0f245eb515c6ed63988c21e9c40bcfd485cd1ec63bdd790f9a81d301f15"},
-    {file = "ruff-0.0.264-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:323ae6c1702b26c96d0fbf939c5959c37e79021f86b70f63634df918bc77f36e"},
-    {file = "ruff-0.0.264-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18a29ed37bf8cfe6dce8a2db56c313a64c0804095108753621f3c3321e0c9c5f"},
-    {file = "ruff-0.0.264-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:d628de91e2be7a83128526636097d2dd890669a06143f826f6c591d79aeefbc4"},
-    {file = "ruff-0.0.264-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91c6eb4f979b661a2dd850d9ac803842bb7b66d4926de84f09c787af82590f73"},
-    {file = "ruff-0.0.264-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04ec5d75e4bca754cedd20d53e2ba4920d6259e7579abfb2e8e30c3c80e41b17"},
-    {file = "ruff-0.0.264-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71fd865ebacc1083259b3fb7e3eb45235a86e62e21830b8a6b067be0ec54aa2e"},
-    {file = "ruff-0.0.264-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cd4f60ffc3eb15802c554a9c8581bf2117c4d3d06fbc57e0ba58f04cb1aaa47f"},
-    {file = "ruff-0.0.264-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:05ee163a046fc593d150179d23f4af447fb82f3e59cd34e031ea0868c65bb8e8"},
-    {file = "ruff-0.0.264-py3-none-musllinux_1_2_i686.whl", hash = "sha256:484e395d1984ab9e1e66bd42e7a5192decfee86998d07d36ee50b2fadccc8734"},
-    {file = "ruff-0.0.264-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:67326fdc9ac0a1b13e229c6e24e8d115863c52cd710faaaaa588851535281d6c"},
-    {file = "ruff-0.0.264-py3-none-win32.whl", hash = "sha256:5a8658ebcc37d62f72840cbdf564171c1a2b6831db482b4d917962541a2f4a44"},
-    {file = "ruff-0.0.264-py3-none-win_amd64.whl", hash = "sha256:068a82a29d80848a56e3d9d4308e6e0ca8b2ecdaf5ac342a292545a59b7f2c21"},
-    {file = "ruff-0.0.264-py3-none-win_arm64.whl", hash = "sha256:3e2c38449548e122f2612843a7c04e22b4fd491656955c57b8cb05df11639ad6"},
-    {file = "ruff-0.0.264.tar.gz", hash = "sha256:8fcd4b693ca1374eb7a5796581c90689f884f98f388740d94f0702fd30f8f78f"},
+    {file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5fe8d54df166ecc24106db7dd6a68d44852d14eb0729ea4672bb4d96c320b7df"},
+    {file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6f0bfbb53c4b4de117ac4d6ddfd33aa5fc31beeaa21d23c45c6dd249faf9126f"},
+    {file = "ruff-0.1.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0d432aec35bfc0d800d4f70eba26e23a352386be3a6cf157083d18f6f5881c8"},
+    {file = "ruff-0.1.15-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9405fa9ac0e97f35aaddf185a1be194a589424b8713e3b97b762336ec79ff807"},
+    {file = "ruff-0.1.15-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c66ec24fe36841636e814b8f90f572a8c0cb0e54d8b5c2d0e300d28a0d7bffec"},
+    {file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:6f8ad828f01e8dd32cc58bc28375150171d198491fc901f6f98d2a39ba8e3ff5"},
+    {file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86811954eec63e9ea162af0ffa9f8d09088bab51b7438e8b6488b9401863c25e"},
+    {file = "ruff-0.1.15-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd4025ac5e87d9b80e1f300207eb2fd099ff8200fa2320d7dc066a3f4622dc6b"},
+    {file = "ruff-0.1.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b17b93c02cdb6aeb696effecea1095ac93f3884a49a554a9afa76bb125c114c1"},
+    {file = "ruff-0.1.15-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ddb87643be40f034e97e97f5bc2ef7ce39de20e34608f3f829db727a93fb82c5"},
+    {file = "ruff-0.1.15-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abf4822129ed3a5ce54383d5f0e964e7fef74a41e48eb1dfad404151efc130a2"},
+    {file = "ruff-0.1.15-py3-none-musllinux_1_2_i686.whl", hash = "sha256:6c629cf64bacfd136c07c78ac10a54578ec9d1bd2a9d395efbee0935868bf852"},
+    {file = "ruff-0.1.15-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1bab866aafb53da39c2cadfb8e1c4550ac5340bb40300083eb8967ba25481447"},
+    {file = "ruff-0.1.15-py3-none-win32.whl", hash = "sha256:2417e1cb6e2068389b07e6fa74c306b2810fe3ee3476d5b8a96616633f40d14f"},
+    {file = "ruff-0.1.15-py3-none-win_amd64.whl", hash = "sha256:3837ac73d869efc4182d9036b1405ef4c73d9b1f88da2413875e34e0d6919587"},
+    {file = "ruff-0.1.15-py3-none-win_arm64.whl", hash = "sha256:9a933dfb1c14ec7a33cceb1e49ec4a16b51ce3c20fd42663198746efc0427360"},
+    {file = "ruff-0.1.15.tar.gz", hash = "sha256:f6dfa8c1b21c913c326919056c390966648b680966febcb796cc9d1aaab8564e"},
 ]
 
 [[package]]
@@ -3856,6 +3805,26 @@ files = [
 [package.dependencies]
 sphinx = "*"
 
+[[package]]
+name = "sphinx-book-theme"
+version = "1.1.2"
+description = "A clean book theme for scientific explanations and documentation with Sphinx"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "sphinx_book_theme-1.1.2-py3-none-any.whl", hash = "sha256:cee744466fde48f50302b851291b208aa67e726ca31b7a3bfb9b6e6a145663e0"},
+    {file = "sphinx_book_theme-1.1.2.tar.gz", hash = "sha256:7f3abcd146ca82e6f39d6db53711102b1c1d328d12f65e3e47ad9bf842614a49"},
+]
+
+[package.dependencies]
+pydata-sphinx-theme = ">=0.14"
+sphinx = ">=5"
+
+[package.extras]
+code-style = ["pre-commit"]
+doc = ["ablog", "folium", "ipywidgets", "matplotlib", "myst-nb", "nbclient", "numpy", "numpydoc", "pandas", "plotly", "sphinx-copybutton", "sphinx-design", "sphinx-examples", "sphinx-tabs", "sphinx-thebe", "sphinx-togglebutton", "sphinxcontrib-bibtex", "sphinxcontrib-youtube", "sphinxext-opengraph"]
+test = ["beautifulsoup4", "coverage", "myst-nb", "pytest", "pytest-cov", "pytest-regressions", "sphinx_thebe"]
+
 [[package]]
 name = "sphinx-copybutton"
 version = "0.5.2"
@@ -3885,25 +3854,6 @@ files = [
     {file = "sphinx_exec_code-0.10-py3-none-any.whl", hash = "sha256:2597460a7062bfd8ef1b108a8cec1dc10250d56a19034830b038ac653dee1902"},
 ]
 
-[[package]]
-name = "sphinx-rtd-theme"
-version = "1.3.0"
-description = "Read the Docs theme for Sphinx"
-optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
-files = [
-    {file = "sphinx_rtd_theme-1.3.0-py2.py3-none-any.whl", hash = "sha256:46ddef89cc2416a81ecfbeaceab1881948c014b1b6e4450b815311a89fb977b0"},
-    {file = "sphinx_rtd_theme-1.3.0.tar.gz", hash = "sha256:590b030c7abb9cf038ec053b95e5380b5c70d61591eb0b552063fbe7c41f0931"},
-]
-
-[package.dependencies]
-docutils = "<0.19"
-sphinx = ">=1.6,<8"
-sphinxcontrib-jquery = ">=4,<5"
-
-[package.extras]
-dev = ["bump2version", "sphinxcontrib-httpdomain", "transifex-client", "wheel"]
-
 [[package]]
 name = "sphinxcontrib-applehelp"
 version = "1.0.8"
@@ -3952,20 +3902,6 @@ lint = ["docutils-stubs", "flake8", "mypy"]
 standalone = ["Sphinx (>=5)"]
 test = ["html5lib", "pytest"]
 
-[[package]]
-name = "sphinxcontrib-jquery"
-version = "4.1"
-description = "Extension to include jQuery on newer Sphinx releases"
-optional = false
-python-versions = ">=2.7"
-files = [
-    {file = "sphinxcontrib-jquery-4.1.tar.gz", hash = "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a"},
-    {file = "sphinxcontrib_jquery-4.1-py2.py3-none-any.whl", hash = "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae"},
-]
-
-[package.dependencies]
-Sphinx = ">=1.8"
-
 [[package]]
 name = "sphinxcontrib-jsmath"
 version = "1.0.1"
@@ -4372,13 +4308,13 @@ zstd = ["zstandard (>=0.18.0)"]
 
 [[package]]
 name = "virtualenv"
-version = "20.26.1"
+version = "20.26.2"
 description = "Virtual Python Environment builder"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "virtualenv-20.26.1-py3-none-any.whl", hash = "sha256:7aa9982a728ae5892558bff6a2839c00b9ed145523ece2274fad6f414690ae75"},
-    {file = "virtualenv-20.26.1.tar.gz", hash = "sha256:604bfdceaeece392802e6ae48e69cec49168b9c5f4a44e483963f9242eb0e78b"},
+    {file = "virtualenv-20.26.2-py3-none-any.whl", hash = "sha256:a624db5e94f01ad993d476b9ee5346fdf7b9de43ccaee0e0197012dc838a0e9b"},
+    {file = "virtualenv-20.26.2.tar.gz", hash = "sha256:82bf0f4eebbb78d36ddaee0283d43fe5736b53880b8a8cdcd37390a07ac3741c"},
 ]
 
 [package.dependencies]
@@ -4474,4 +4410,4 @@ plots = ["matplotlib"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9, <3.11"
-content-hash = "a472b22884244e485f3bab9e73dc6baf95b8f3714a0e2ad19b25734de61971f9"
+content-hash = "5e7c15209b92f58e81b3190530856f6e4cf1b4b427c05d333b636ca51965439c"

+ 9 - 7
pyproject.toml

@@ -5,6 +5,8 @@ description = "Reading country greenhouse gas data submitted to the United Natio
 authors = ["Johannes Gütschow <mail@johannes-guetschow.de>"]
 readme = "README.md"
 packages = [{include = "unfccc_ghg_data", from = "src"}]
+license = "TBD"
+include = ["LICENCE"]  # poetry uses US English so assumes it will be spelt LICENSE
 
 [tool.poetry.dependencies]
 python = ">=3.9, <3.11"
@@ -32,7 +34,7 @@ pytest = "^7.3.1"
 
 [tool.poetry.group.docs.dependencies]
 myst-nb = "^0.17.0"
-sphinx-rtd-theme = "^1.2.0"
+sphinx-book-theme = "^1.1.0"
 sphinx-autodoc-typehints = "^1.23.0"
 sphinx-autodocgen = "^1.3"
 jupytext = "^1.14.5"
@@ -41,10 +43,8 @@ sphinx-copybutton = "^0.5.2"
 [tool.poetry.group.dev.dependencies]
 pytest-cov = "^4.0.0"
 coverage = "^7.2.0"
-black = "23.3.0"
-blackdoc = "0.3.8"
 mypy = "^1.2.0"
-ruff = "0.0.264"
+ruff = "^0.1.8"
 pre-commit = "^3.3.1"
 towncrier = "^23.6.0"
 liccheck = "^0.9.1"
@@ -113,8 +113,10 @@ ignore = [
     "D200",  # One-line docstring should fit on one line with quotes
     "D400",  # First line should end with a period
 ]
-# Provide some leeway for long docstring, this is otherwise handled by black
-line-length = 110
+line-length = 88
+
+[tool.ruff.format]
+docstring-code-format = true
 
 [tool.ruff.per-file-ignores]
 "test*.py" = [
@@ -138,7 +140,7 @@ known-first-party = ["src"]
 convention = "numpy"
 
 [tool.towncrier]
-package = "unfccc-ghg-data"
+package = "unfccc_ghg_data"
 package_dir = "src"
 filename = "docs/source/changelog.md"
 directory = "changelog/"

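Swapping the docs theme dependency from sphinx-rtd-theme to sphinx-book-theme also needs a matching html_theme setting on the Sphinx side. A minimal sketch of the relevant conf.py lines, assuming default theme options (the project's actual docs/source/conf.py may set more than this):

    # docs/source/conf.py -- minimal sketch only, not the project's full configuration
    extensions = [
        "myst_nb",            # from the myst-nb docs dependency
        "sphinx_copybutton",  # from the sphinx-copybutton docs dependency
    ]
    html_theme = "sphinx_book_theme"  # previously "sphinx_rtd_theme"
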
+ 131 - 125
src/unfccc_ghg_data/unfccc_downloader/download_btr.py

@@ -28,141 +28,147 @@ from unfccc_ghg_data.unfccc_downloader import get_BTR_name_and_URL
 # python-selenium-firefox-driver-dismiss-open-save-file-popup
 ###############
 
-descr = (
-    "Download and unzip data from UNFCCC Biannial Transparency Reports Submissions. "
-    "Based on download.py from national-inventory-submissions "
-    "(https://github.com/openclimatedata/national-inventory-submisions)"
-)
-parser = argparse.ArgumentParser(description=descr)
-
-parser.add_argument("--round", help="Submission round to download, e.g. 1")
-
-args = parser.parse_args()
-submission_round = int(args.round)
-
-round_name, url = get_BTR_name_and_URL(submission_round)
-dataset = f"BTR{submission_round}"
-
-print(f"Downloading data for {round_name} BTRs")
-
-error_file_sizes = [212, 210]
-
-# Read submissions list
-submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-{dataset}.csv")
-
-# set options for headless mode
-profile_path = ".firefox"
-options = Options()
-# options.add_argument('-headless')
-
-# create profile for headless mode and automatic downloading
-options.set_preference("profile", profile_path)
-options.set_preference("browser.download.folderList", 2)
-
-# set up selenium driver
-driver = Firefox(options=options)
-# visit the main data page once to create cookies
-driver.get(url)
-
-# wait a bit for the website to load before we get the cookies
-time.sleep(20)
-
-# get the session id cookie
-cookies_selenium = driver.get_cookies()
-cookies = {}
-for cookie in cookies_selenium:
-    cookies[cookie["name"]] = cookie["value"]
-
-new_downloaded = []
-
-for idx, submission in submissions.iterrows():
-    print("=" * 60)
-    title = submission.Title
-    url = submission.URL
-    country = submission.Country
-    country = country.replace(" ", "_")
-    print(f"Downloading {title} from {url}")
-
-    country_folder = downloaded_data_path_UNFCCC / country
-    if not country_folder.exists():
-        country_folder.mkdir()
-    local_filename = (
-        country_folder
-        / dataset
-        / url.split("/")[-1].replace("%20", "_").replace(" ", "_")
+if __name__ == "__main__":
+    descr = (
+        "Download and unzip data from UNFCCC Biannial Transparency Reports Submissions."
+        " Based on download.py from national-inventory-submissions "
+        "(https://github.com/openclimatedata/national-inventory-submisions)"
     )
-    if not local_filename.parent.exists():
-        local_filename.parent.mkdir()
-
-    if local_filename.exists():
-        # check file size. if 210 or 212 bytes it's the error page
-        if Path(local_filename).stat().st_size in error_file_sizes:
-            # found the error page. delete file
-            os.remove(local_filename)
-
-    # now we have removed error pages, so a present file should not be overwritten
-    if (not local_filename.exists()) and (not local_filename.is_symlink()):
-        i = 0  # reset counter
-        while not local_filename.exists() and i < 10:  # noqa: PLR2004
-            # for i = 0 and i = 5 try to get a new session ID
-            if i == 1 or i == 5:  # noqa: PLR2004
-                driver = Firefox(options=options)
-
-                # visit the main data page once to create cookies
-                driver.get(url)
-                time.sleep(20)
-
-                # get the session id cookie
-                cookies_selenium = driver.get_cookies()
-                cookies = {}
-                for cookie in cookies_selenium:
-                    cookies[cookie["name"]] = cookie["value"]
-
-            r = requests.get(url, stream=True, cookies=cookies)  # noqa: S113
-            with open(str(local_filename), "wb") as f:
-                shutil.copyfileobj(r.raw, f)
+    parser = argparse.ArgumentParser(description=descr)
 
+    parser.add_argument("--round", help="Submission round to download, " "e.g. 1")
+
+    args = parser.parse_args()
+    submission_round = int(args.round)
+
+    round_name, url = get_BTR_name_and_URL(submission_round)
+    dataset = f"BTR{submission_round}"
+
+    print(f"Downloading data for {round_name} BTRs")
+
+    error_file_sizes = [212, 210]
+
+    # Read submissions list
+    submissions = pd.read_csv(
+        downloaded_data_path_UNFCCC / f"submissions-{dataset}.csv"
+    )
+
+    # set options for headless mode
+    profile_path = ".firefox"
+    options = Options()
+    # options.add_argument('-headless')
+
+    # create profile for headless mode and automatic downloading
+    options.set_preference("profile", profile_path)
+    options.set_preference("browser.download.folderList", 2)
+
+    # set up selenium driver
+    driver = Firefox(options=options)
+    # visit the main data page once to create cookies
+    driver.get(url)
+
+    # wait a bit for the website to load before we get the cookies
+    time.sleep(20)
+
+    # get the session id cookie
+    cookies_selenium = driver.get_cookies()
+    cookies = {}
+    for cookie in cookies_selenium:
+        cookies[cookie["name"]] = cookie["value"]
+
+    new_downloaded = []
+
+    for idx, submission in submissions.iterrows():
+        print("=" * 60)
+        title = submission.Title
+        url = submission.URL
+        country = submission.Country
+        country = country.replace(" ", "_")
+        print(f"Downloading {title} from {url}")
+
+        country_folder = downloaded_data_path_UNFCCC / country
+        if not country_folder.exists():
+            country_folder.mkdir()
+        local_filename = (
+            country_folder
+            / dataset
+            / url.split("/")[-1].replace("%20", "_").replace(" ", "_")
+        )
+        if not local_filename.parent.exists():
+            local_filename.parent.mkdir()
+
+        if local_filename.exists():
             # check file size. if 210 or 212 bytes it's the error page
             if Path(local_filename).stat().st_size in error_file_sizes:
                 # found the error page. delete file
                 os.remove(local_filename)
 
-            # sleep a bit to avoid running into captchas
-            time.sleep(randrange(5, 15))  # noqa: S311
-
-        if local_filename.exists():
-            new_downloaded.append(submission)
-            print(f"Download => {local_filename.relative_to(root_path)}")
-            # unzip data (only for new downloads)
-            if local_filename.suffix == ".zip":
-                try:
-                    zipped_file = zipfile.ZipFile(str(local_filename), "r")
-                    zipped_file.extractall(str(local_filename.parent))
-                    print(f"Extracted {len(zipped_file.namelist())} files.")
-                    zipped_file.close()
-                # TODO Better error logging/visibilty
-                except zipfile.BadZipFile:
+        # now we have removed error pages, so a present file should not be overwritten
+        if (not local_filename.exists()) and (not local_filename.is_symlink()):
+            i = 0  # reset counter
+            while not local_filename.exists() and i < 10:  # noqa: PLR2004
+                # for i = 1 and i = 5 try to get a new session ID
+                if i in (1, 5):
+                    driver = Firefox(options=options)
+
+                    # visit the main data page once to create cookies
+                    driver.get(url)
+                    time.sleep(20)
+
+                    # get the session id cookie
+                    cookies_selenium = driver.get_cookies()
+                    cookies = {}
+                    for cookie in cookies_selenium:
+                        cookies[cookie["name"]] = cookie["value"]
+
+                r = requests.get(url, stream=True, cookies=cookies)  # noqa: S113
+                with open(str(local_filename), "wb") as f:
+                    shutil.copyfileobj(r.raw, f)
+
+                # check file size. if 210 or 212 bytes it's the error page
+                if Path(local_filename).stat().st_size in error_file_sizes:
+                    # found the error page. delete file
+                    os.remove(local_filename)
+
+                # sleep a bit to avoid running into captchas
+                time.sleep(randrange(5, 15))  # noqa: S311
+                i += 1  # count this attempt so the retry loop terminates
+
+            if local_filename.exists():
+                new_downloaded.append(submission)
+                print(f"Download => {local_filename.relative_to(root_path)}")
+                # unzip data (only for new downloads)
+                if local_filename.suffix == ".zip":
+                    try:
+                        zipped_file = zipfile.ZipFile(str(local_filename), "r")
+                        zipped_file.extractall(str(local_filename.parent))
+                        print(f"Extracted {len(zipped_file.namelist())} files.")
+                        zipped_file.close()
+                    # TODO Better error logging/visibility
+                    except zipfile.BadZipFile:
+                        print(
+                            f"Error while trying to extract "
+                            f"{local_filename.relative_to(root_path)}"
+                        )
+                    except NotImplementedError:
+                        print(
+                            "Zip format not supported, please unzip on the command "
+                            "line."
+                        )
+                else:
                     print(
-                        f"Error while trying to extract "
-                        f"{local_filename.relative_to(root_path)}"
+                        f"Not attempting to extract "
+                        f"{local_filename.relative_to(root_path)}."
                     )
-                except NotImplementedError:
-                    print("Zip format not supported, please unzip on the command line.")
             else:
-                print(
-                    f"Not attempting to extract "
-                    f"{local_filename.relative_to(root_path)}."
-                )
-        else:
-            print(f"Failed to download {local_filename.relative_to(root_path)}")
+                print(f"Failed to download {local_filename.relative_to(root_path)}")
 
-    else:
-        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
+        else:
+            print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
 
-driver.close()
+    driver.close()
 
-df_new_downloads = pd.DataFrame(new_downloaded)
-df_new_downloads.to_csv(
-    downloaded_data_path_UNFCCC / f"00_new_downloads_{dataset}-{date.today()}.csv",
-    index=False,
-)
+    df_new_downloads = pd.DataFrame(new_downloaded)
+    df_new_downloads.to_csv(
+        downloaded_data_path_UNFCCC / f"00_new_downloads_{dataset}-{date.today()}.csv",
+        index=False,
+    )

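The substantive change in this file is that everything that used to run at module level now sits under an if __name__ == "__main__": guard. Module-level statements execute on import, so previously any import of this script, for instance by Sphinx autodoc when it imports a module to document it, would have launched a Selenium session and started the download loop. The pattern, with the body elided:

    # runs only when the file is executed as a script; a plain import
    # (for example by Sphinx autodoc) now has no side effects
    if __name__ == "__main__":
        ...  # argument parsing, Selenium setup and the download loop live here

The same guard is applied to fetch_submissions_btr.py and the reader scripts below.
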
+ 84 - 81
src/unfccc_ghg_data/unfccc_downloader/fetch_submissions_btr.py

@@ -17,90 +17,93 @@ from unfccc_ghg_data.unfccc_downloader import (
     get_unfccc_submission_info,
 )
 
-max_tries = 10
-
-descr = (
-    "Download UNFCCC Biannial Transparency Reports Submissions lists "
-    "and create list of submissions as CSV file. Based on "
-    "process.py from national-inventory-submissions "
-    "(https://github.com/openclimatedata/national-inventory-submisions)"
-)
-parser = argparse.ArgumentParser(description=descr)
-parser.add_argument("--round", help="1 for first BTRs, 2 for second BTRs etc.")
-
-args = parser.parse_args()
-submission_round = int(args.round)
-
-round_name, url = get_BTR_name_and_URL(submission_round)
-
-print(f"Fetching submissions for {round_name} BTRs")
-print(f"Using {url} to get submissions list")
-
-# set options for headless mode
-profile_path = ".firefox"
-options = Options()
-options.add_argument("-headless")
+if __name__ == "__main__":
+    max_tries = 10
+
+    descr = (
+        "Download UNFCCC Biannial Transparency Reports Submissions lists "
+        "and create list of submissions as CSV file. Based on "
+        "process.py from national-inventory-submissions "
+        "(https://github.com/openclimatedata/national-inventory-submisions)"
+    )
+    parser = argparse.ArgumentParser(description=descr)
+    parser.add_argument("--round", help="1 for first BTRs, 2 for second BTRs etc.")
+
+    args = parser.parse_args()
+    submission_round = int(args.round)
+
+    round_name, url = get_BTR_name_and_URL(submission_round)
+
+    print(f"Fetching submissions for {round_name} BTRs")
+    print(f"Using {url} to get submissions list")
+
+    # set options for headless mode
+    profile_path = ".firefox"
+    options = Options()
+    options.add_argument("-headless")
+
+    # create profile for headless mode and automatic downloading
+    options.set_preference("profile", profile_path)
+
+    # set up selenium driver
+    driver = Firefox(options=options)
+    driver.get(url)
+
+    html = BeautifulSoup(driver.page_source, "html.parser")
+
+    table = html.find("table")
+
+    # check if table found. If not, the get command didn't work, likely because
+    # of a captcha on the site
+    ### TODO replace by error message
+    if not table:
+        raise RuntimeError(  # noqa: TRY003
+            "No table found on URL. Possibly due to a captcha."
+        )
+
+    links = table.findAll("a")
+
+    targets = []  # sub-pages
+    downloads = []
+    no_downloads = []
+
+    # Check links for Zipfiles or subpages
+    for link in links:
+        if "href" not in link.attrs:
+            continue
+        href = link.attrs["href"]
+        if "/documents/" in href:
+            if "title" in link.attrs.keys():
+                title = link.attrs["title"]
+            else:
+                title = link.contents[0]
+            if href.startswith("/documents"):
+                href = "https://unfccc.int" + href
+            # Only add pages in the format https://unfccc.int/documents/65587
+            # to further downloads
+            if str(Path(href).parent).endswith("documents"):
+                targets.append({"title": title, "url": href})
+        else:
+            print(f"Ignored link: {href}: not in the right format.")
 
-# create profile for headless mode and automatic downloading
-options.set_preference("profile", profile_path)
+    # Go through sub-pages.
+    for target in targets:
+        time.sleep(randrange(5, 15))  # noqa: S311
+        url = target["url"]
 
-# set up selenium driver
-driver = Firefox(options=options)
-driver.get(url)
+        submission_info = get_unfccc_submission_info(url, driver, max_tries=max_tries)
 
-html = BeautifulSoup(driver.page_source, "html.parser")
+        if submission_info:
+            downloads = downloads + submission_info
+        else:
+            no_downloads.append({target["title"], url})
 
-table = html.find("table")
+    if len(no_downloads) > 0:
+        print("No downloads for ", no_downloads)
 
-# check if table found. if not the get command didn't work, likely because of a captcha on the site
-### TODO replace by error message
-if not table:
-    raise RuntimeError(  # noqa: TRY003
-        "No table found on URL. Possibly due to a captcha."
+    driver.close()
+    df_downloads = pd.DataFrame(downloads)
+    df_downloads.to_csv(
+        downloaded_data_path_UNFCCC / f"submissions-BTR{submission_round}.csv",
+        index=False,
     )
-
-links = table.findAll("a")
-
-targets = []  # sub-pages
-downloads = []
-no_downloads = []
-
-# Check links for Zipfiles or subpages
-for link in links:
-    if "href" not in link.attrs:
-        continue
-    href = link.attrs["href"]
-    if "/documents/" in href:
-        if "title" in link.attrs.keys():
-            title = link.attrs["title"]
-        else:
-            title = link.contents[0]
-        if href.startswith("/documents"):
-            href = "https://unfccc.int" + href
-        # Only add pages in the format https://unfccc.int/documents/65587
-        # to further downloads
-        if str(Path(href).parent).endswith("documents"):
-            targets.append({"title": title, "url": href})
-    else:
-        print(f"Ignored link: {href}: not in the right format.")
-
-# Go through sub-pages.
-for target in targets:
-    time.sleep(randrange(5, 15))  # noqa: S311
-    url = target["url"]
-
-    submission_info = get_unfccc_submission_info(url, driver, max_tries=max_tries)
-
-    if submission_info:
-        downloads = downloads + submission_info
-    else:
-        no_downloads.append({target["title"], url})
-
-if len(no_downloads) > 0:
-    print("No downloads for ", no_downloads)
-
-driver.close()
-df_downloads = pd.DataFrame(downloads)
-df_downloads.to_csv(
-    downloaded_data_path_UNFCCC / f"submissions-BTR{submission_round}.csv", index=False
-)

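A side note on the link filter in the block above: it applies pathlib to URL strings. Path(href).parent drops the last path component (and collapses the double slash after the scheme), so the endswith("documents") test keeps exactly links of the form https://unfccc.int/documents/65587 and skips anything nested more deeply. A small illustration (the second URL is invented for contrast):

    from pathlib import Path

    doc_page = "https://unfccc.int/documents/65587"
    deeper = "https://unfccc.int/documents/65587/download"  # hypothetical deeper link

    print(Path(doc_page).parent)                              # https:/unfccc.int/documents (on POSIX)
    print(str(Path(doc_page).parent).endswith("documents"))   # True
    print(str(Path(deeper).parent).endswith("documents"))     # False
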
+ 5 - 5
src/unfccc_ghg_data/unfccc_downloader/unfccc_submission_info.py

@@ -34,10 +34,10 @@ def get_unfccc_submission_info(  # noqa: PLR0912, PLR0915
     -------
     A list with information for each downloadable file linked on the submission page.
     For each file the information is stored in a dict with the fields
-        "Kind": kind,
-        "Country": country,
-        "Title": title,
-        "URL": file,
+    "Kind": kind,
+    "Country": country,
+    "Title": title,
+    "URL": file,
 
     """
     info = []
@@ -143,7 +143,7 @@ def get_unfccc_submission_info(  # noqa: PLR0912, PLR0915
 
 def get_BTR_name_and_URL(submission_round: int) -> (str, str):
     """
-        Get the name and URL of a BTR for a given number
+    Get the name and URL of a BTR for a given number
 
     Parameters
     ----------

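For orientation, the list returned by get_unfccc_submission_info therefore holds one dict per downloadable file, using the field names from the docstring above; a single entry has roughly this shape (all values invented for illustration):

    # illustrative entry only -- field names from the docstring, values made up
    {
        "Kind": "BTR",
        "Country": "Argentina",
        "Title": "First Biennial Transparency Report of Argentina",
        "URL": "https://unfccc.int/documents/12345/example.pdf",
    }
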
+ 108 - 105
src/unfccc_ghg_data/unfccc_reader/Argentina/read_ARG_BUR5_from_csv.py

@@ -13,7 +13,8 @@ license probably CC-BY 4.0
 * Data are in long format. Columns needed are
 'año' 'id_ipcc' 'tipo_de_gas' 'valor_en_toneladas_de_gas'
 * columns to ignore are
-columns_to_ignore = ['sector', 'actividad', 'subactividad', 'categoria', 'valor_en_toneladas_de_co2e']
+columns_to_ignore = ['sector', 'actividad', 'subactividad', 'categoria',
+'valor_en_toneladas_de_co2e']
 * sector codes are in primap1 format (no dots), reading should be possible
 directly from CSV into interchange format
 * postprocessing needed is aggregation of gas baskets and categories as only
@@ -23,7 +24,16 @@ the highest detail categories are present
 
 import pandas as pd
 import primap2 as pm2
-from config_arg_bur5 import (
+
+from unfccc_ghg_data.helper import (
+    compression,
+    downloaded_data_path,
+    extracted_data_path,
+    gas_baskets,
+    process_data_for_country,
+)
+
+from .config_arg_bur5 import (
     cats_to_agg,
     coords_cols,
     coords_defaults,
@@ -37,106 +47,99 @@ from config_arg_bur5 import (
     unit,
 )
 
-from unfccc_ghg_data.helper import (
-    compression,
-    downloaded_data_path,
-    extracted_data_path,
-    gas_baskets,
-    process_data_for_country,
-)
-
-# ###
-# configuration
-# ###
-
-# folders and files
-input_folder = downloaded_data_path / "UNFCCC" / "Argentina" / "BUR5"
-output_folder = extracted_data_path / "UNFCCC" / "Argentina"
-if not output_folder.exists():
-    output_folder.mkdir()
-
-output_filename = "ARG_BUR5_2023_"
-
-csv_file = "emisiones_gei_inventario_datos_totales_1990_2020.csv"
-
-
-# read the data
-data_pd = pd.read_csv(
-    input_folder / csv_file,
-    sep=";",
-    parse_dates=[coords_cols["time"]],
-    usecols=list(coords_cols.values()),
-)
-
-data_pd["unit"] = unit
-coords_cols["unit"] = "unit"
-
-data_if = pm2.pm2io.convert_long_dataframe_if(
-    data_pd,
-    coords_cols=coords_cols,
-    coords_defaults=coords_defaults,
-    coords_value_mapping=coords_value_mapping,
-    coords_value_filling=coords_value_filling,
-    coords_terminologies=coords_terminologies,
-    filter_remove=filter_remove,
-    filter_keep=filter_keep,
-    meta_data=meta_data,
-    time_format=time_format,
-)
-
-data_pm2 = pm2.pm2io.from_interchange_format(data_if)
-data_if = data_pm2.pr.to_interchange_format()
-
-# ###
-# save data to IF and native format
-# ###
-if not output_folder.exists():
-    output_folder.mkdir()
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-    data_if,
-)
-
-data_pm2 = pm2.pm2io.from_interchange_format(data_if)
-encoding = {var: compression for var in data_pm2.data_vars}
-data_pm2.pr.to_netcdf(
-    output_folder
-    / (output_filename + coords_terminologies["category"] + "_raw" + ".nc"),
-    encoding=encoding,
-)
-
-### processing
-data_proc_pm2 = data_pm2
-
-# actual processing
-country_processing = {
-    "aggregate_cats": cats_to_agg,
-}
-data_proc_pm2 = process_data_for_country(
-    data_proc_pm2,
-    entities_to_ignore=[],
-    gas_baskets=gas_baskets,
-    processing_info_country=country_processing,
-)
-
-# adapt source and metadata
-current_source = data_proc_pm2.coords["source"].to_numpy()[0]
-data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
-data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
-data_proc_pm2 = data_proc_pm2.pr.loc[{"source": ["BUR_NIR"]}]
-
-# ###
-# save data to IF and native format
-# ###
-data_proc_if = data_proc_pm2.pr.to_interchange_format()
-if not output_folder.exists():
-    output_folder.mkdir()
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + coords_terminologies["category"]), data_proc_if
-)
-
-encoding = {var: compression for var in data_proc_pm2.data_vars}
-data_proc_pm2.pr.to_netcdf(
-    output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
-    encoding=encoding,
-)
+if __name__ == "__main__":
+    # ###
+    # configuration
+    # ###
+
+    # folders and files
+    input_folder = downloaded_data_path / "UNFCCC" / "Argentina" / "BUR5"
+    output_folder = extracted_data_path / "UNFCCC" / "Argentina"
+    if not output_folder.exists():
+        output_folder.mkdir()
+
+    output_filename = "ARG_BUR5_2023_"
+
+    csv_file = "emisiones_gei_inventario_datos_totales_1990_2020.csv"
+
+    # read the data
+    data_pd = pd.read_csv(
+        input_folder / csv_file,
+        sep=";",
+        parse_dates=[coords_cols["time"]],
+        usecols=list(coords_cols.values()),
+    )
+
+    data_pd["unit"] = unit
+    coords_cols["unit"] = "unit"
+
+    data_if = pm2.pm2io.convert_long_dataframe_if(
+        data_pd,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_value_mapping=coords_value_mapping,
+        coords_value_filling=coords_value_filling,
+        coords_terminologies=coords_terminologies,
+        filter_remove=filter_remove,
+        filter_keep=filter_keep,
+        meta_data=meta_data,
+        time_format=time_format,
+    )
+
+    data_pm2 = pm2.pm2io.from_interchange_format(data_if)
+    data_if = data_pm2.pr.to_interchange_format()
+
+    # ###
+    # save data to IF and native format
+    # ###
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
+
+    data_pm2 = pm2.pm2io.from_interchange_format(data_if)
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw" + ".nc"),
+        encoding=encoding,
+    )
+
+    ### processing
+    data_proc_pm2 = data_pm2
+
+    # actual processing
+    country_processing = {
+        "aggregate_cats": cats_to_agg,
+    }
+    data_proc_pm2 = process_data_for_country(
+        data_proc_pm2,
+        entities_to_ignore=[],
+        gas_baskets=gas_baskets,
+        processing_info_country=country_processing,
+    )
+
+    # adapt source and metadata
+    current_source = data_proc_pm2.coords["source"].to_numpy()[0]
+    data_temp = data_proc_pm2.pr.loc[{"source": current_source}]
+    data_proc_pm2 = data_proc_pm2.pr.set("source", "BUR_NIR", data_temp)
+    data_proc_pm2 = data_proc_pm2.pr.loc[{"source": ["BUR_NIR"]}]
+
+    # ###
+    # save data to IF and native format
+    # ###
+    data_proc_if = data_proc_pm2.pr.to_interchange_format()
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]),
+        data_proc_if,
+    )
+
+    encoding = {var: compression for var in data_proc_pm2.data_vars}
+    data_proc_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )

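Besides the main guard, the reader scripts switch from implicit sibling imports (from config_arg_bur5 import ...) to package-relative imports (from .config_arg_bur5 import ...). That makes the module safely importable, e.g. for the API docs, but it also means the file can no longer be executed by its path alone; it has to run in its package context, for instance via python -m. A sketch of an equivalent programmatic call, assuming the package is importable in the active environment (the repository may provide its own runner for these scripts):

    # equivalent to: python -m unfccc_ghg_data.unfccc_reader.Argentina.read_ARG_BUR5_from_csv
    import runpy

    runpy.run_module(
        "unfccc_ghg_data.unfccc_reader.Argentina.read_ARG_BUR5_from_csv",
        run_name="__main__",  # so the new __main__ guard actually fires
    )
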
+ 178 - 178
src/unfccc_ghg_data/unfccc_reader/Burundi/read_BDI_BUR1_from_pdf.py

@@ -4,7 +4,11 @@ Read Burundi's BUR1 from pdf
 import camelot
 import pandas as pd
 import primap2 as pm2
-from config_bdi_bur1 import (
+
+from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
+from unfccc_ghg_data.helper.functions import process_data_for_country
+
+from .config_bdi_bur1 import (
     coords_cols,
     coords_defaults,
     coords_terminologies,
@@ -17,212 +21,208 @@ from config_bdi_bur1 import (
     meta_data,
 )
 
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
-from unfccc_ghg_data.helper.functions import process_data_for_country
-
-# ###
-# configuration
-# ###
-
-input_folder = downloaded_data_path / "UNFCCC" / "Burundi" / "BUR1"
-output_folder = extracted_data_path / "UNFCCC" / "Burundi"
-
-if not output_folder.exists():
-    output_folder.mkdir()
-
-pdf_file = "Burundi_BUR_1_Report__Francais.pdf"
-output_filename = "BDI_BUR1_2023_"
-category_column = f"category ({coords_terminologies['category']})"
-compression = dict(zlib=True, complevel=9)
-
-# ###
-# 1. Read in tables
-# ###
-
-df_all = None
-for year in inv_conf_per_year.keys():
-    print("-" * 60)
-    print(f"Reading year {year}.")
-    print("-" * 60)
-    df_year = None
-    for page in inv_conf_per_year[year]["pages_to_read"]:
-        print(f"Reading table from page {page}.")
-        tables_inventory_original = camelot.read_pdf(
-            str(input_folder / pdf_file),
-            pages=page,
-            flavor="lattice",
-            split_text=True,
+if __name__ == "__main__":
+    # ###
+    # configuration
+    # ###
+
+    input_folder = downloaded_data_path / "UNFCCC" / "Burundi" / "BUR1"
+    output_folder = extracted_data_path / "UNFCCC" / "Burundi"
+
+    if not output_folder.exists():
+        output_folder.mkdir()
+
+    pdf_file = "Burundi_BUR_1_Report__Francais.pdf"
+    output_filename = "BDI_BUR1_2023_"
+    category_column = f"category ({coords_terminologies['category']})"
+    compression = dict(zlib=True, complevel=9)
+
+    # ###
+    # 1. Read in tables
+    # ###
+
+    df_all = None
+    for year in inv_conf_per_year.keys():
+        print("-" * 60)
+        print(f"Reading year {year}.")
+        print("-" * 60)
+        df_year = None
+        for page in inv_conf_per_year[year]["pages_to_read"]:
+            print(f"Reading table from page {page}.")
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                flavor="lattice",
+                split_text=True,
+            )
+            print("Reading complete.")
+
+            df_page = tables_inventory_original[0].df
+
+            if df_year is None:
+                df_year = df_page
+            else:
+                df_year = pd.concat(
+                    [df_year, df_page],
+                    axis=0,
+                    join="outer",
+                ).reset_index(drop=True)
+
+        print(f"Concatenating all tables for {year}.")
+        # remove line breaks
+        for column in df_year.columns:
+            df_year[column] = df_year[column].str.replace("\n", "")
+
+        # fix broken values in cells
+        if "fix_values" in inv_conf_per_year[year].keys():
+            for index, column, value in inv_conf_per_year[year]["fix_values"]:
+                df_year.loc[index, column] = value
+
+        # delete extra columns
+        if "delete_columns" in inv_conf_per_year[year].keys():
+            for column in inv_conf_per_year[year]["delete_columns"]:
+                df_year = df_year.drop(columns=column)
+            df_year.columns = range(df_year.columns.size)
+
+        df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
+
+        df_year = pd.concat([df_header, df_year[2:]], axis=0, join="outer").reset_index(
+            drop=True
         )
-        print("Reading complete.")
 
-        df_page = tables_inventory_original[0].df
+        df_year = pm2.pm2io.nir_add_unit_information(
+            df_year,
+            unit_row=inv_conf["unit_row"],
+            entity_row=inv_conf["entity_row"],
+            regexp_entity=".*",
+            regexp_unit=".*",
+            default_unit="Gg",
+        )
 
-        if df_year is None:
-            df_year = df_page
-        else:
-            df_year = pd.concat(
-                [df_year, df_page],
-                axis=0,
-                join="outer",
-            ).reset_index(drop=True)
+        print("Added unit information.")
 
-    print(f"Concatenating all tables for {year}.")
-    # remove line breaks
-    for column in df_year.columns:
-        df_year[column] = df_year[column].str.replace("\n", "")
+        # set index
+        df_year = df_year.set_index(inv_conf["index_cols"])
 
-    # fix broken values in cells
-    if "fix_values" in inv_conf_per_year[year].keys():
-        for index, column, value in inv_conf_per_year[year]["fix_values"]:
-            df_year.loc[index, column] = value
+        # convert to long format
+        df_year_long = pm2.pm2io.nir_convert_df_to_long(
+            df_year, year, inv_conf["header_long"]
+        )
 
-    # delete extra columns
-    if "delete_columns" in inv_conf_per_year[year].keys():
-        for column in inv_conf_per_year[year]["delete_columns"]:
-            df_year = df_year.drop(columns=column)
-        df_year.columns = range(df_year.columns.size)
+        # extract from tuple
+        df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
 
-    df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
+        # prep for conversion to PM2 IF and native format
+        # make a copy of the categories row
+        df_year_long["category"] = df_year_long["orig_cat_name"]
 
-    df_year = pd.concat([df_header, df_year[2:]], axis=0, join="outer").reset_index(
-        drop=True
-    )
+        # replace cat names by codes in col "category"
+        # first the manual replacements
+        df_year_long["category"] = df_year_long["category"].str.replace("\n", "")
 
-    df_year = pm2.pm2io.nir_add_unit_information(
-        df_year,
-        unit_row=inv_conf["unit_row"],
-        entity_row=inv_conf["entity_row"],
-        regexp_entity=".*",
-        regexp_unit=".*",
-        default_unit="Gg",
-    )
+        df_year_long["category"] = df_year_long["category"].replace(
+            inv_conf["cat_codes_manual"]
+        )
 
-    print("Added unit information.")
+        df_year_long["category"] = df_year_long["category"].str.replace(".", "")
 
-    # set index
-    df_year = df_year.set_index(inv_conf["index_cols"])
+        # then the regex replacements
+        def repl(m):  # noqa: D103
+            return m.group("code")
 
-    # convert to long format
-    df_year_long = pm2.pm2io.nir_convert_df_to_long(
-        df_year, year, inv_conf["header_long"]
-    )
+        df_year_long["category"] = df_year_long["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
 
-    # extract from tuple
-    df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
+        df_year_long = df_year_long.reset_index(drop=True)
 
-    # prep for conversion to PM2 IF and native format
-    # make a copy of the categories row
-    df_year_long["category"] = df_year_long["orig_cat_name"]
+        df_year_long["data"] = df_year_long["data"].str.replace(",", ".")
 
-    # replace cat names by codes in col "category"
-    # first the manual replacements
-    df_year_long["category"] = df_year_long["category"].str.replace("\n", "")
+        # TODO: I don't think there are NE1 in the tables.
+        # df_year_long["data"] = df_year_long["data"].str.replace("NE1", "NE")
 
-    df_year_long["category"] = df_year_long["category"].replace(
-        inv_conf["cat_codes_manual"]
-    )
+        # make sure all col headers are str
+        df_year_long.columns = df_year_long.columns.map(str)
 
-    df_year_long["category"] = df_year_long["category"].str.replace(".", "")
+        df_year_long = df_year_long.drop(columns=["orig_cat_name"])
 
-    # then the regex replacements
-    def repl(m):  # noqa: D103
-        return m.group("code")
+        if df_all is None:
+            df_all = df_year_long
+        else:
+            df_all = pd.concat(
+                [df_all, df_year_long],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
 
-    df_year_long["category"] = df_year_long["category"].str.replace(
-        inv_conf["cat_code_regexp"], repl, regex=True
+    ### convert to interchange format ###
+    print("Converting to interchange format.")
+    df_all_IF = pm2.pm2io.convert_long_dataframe_if(
+        df_all,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
     )
 
-    df_year_long = df_year_long.reset_index(drop=True)
-
-    df_year_long["data"] = df_year_long["data"].str.replace(",", ".")
-
-    # TODO: I don't think there are NE1 in the tables.
-    # df_year_long["data"] = df_year_long["data"].str.replace("NE1", "NE")
-
-    # make sure all col headers are str
-    df_year_long.columns = df_year_long.columns.map(str)
-
-    df_year_long = df_year_long.drop(columns=["orig_cat_name"])
-
-    if df_all is None:
-        df_all = df_year_long
-    else:
-        df_all = pd.concat(
-            [df_all, df_year_long],
-            axis=0,
-            join="outer",
-        ).reset_index(drop=True)
-
-### convert to interchange format ###
-print("Converting to interchange format.")
-df_all_IF = pm2.pm2io.convert_long_dataframe_if(
-    df_all,
-    coords_cols=coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies,
-    coords_value_mapping=coords_value_mapping,
-    filter_remove=filter_remove,
-    meta_data=meta_data,
-    convert_str=True,
-    time_format="%Y",
-)
-
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_pm2 = pm2.pm2io.from_interchange_format(df_all_IF)
 
-### convert to primap2 format ###
-print("Converting to primap2 format.")
-data_pm2 = pm2.pm2io.from_interchange_format(df_all_IF)
+    # ###
+    # Save raw data to IF and native format.
+    # ###
 
+    data_if = data_pm2.pr.to_interchange_format()
 
-# ###
-# Save raw data to IF and native format.
-# ###
-
-data_if = data_pm2.pr.to_interchange_format()
-
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-    data_if,
-)
-
-encoding = {var: compression for var in data_pm2.data_vars}
-data_pm2.pr.to_netcdf(
-    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-    encoding=encoding,
-)
-
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
 
-# ###
-# Processing
-# ###
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
-data_proc_pm2 = process_data_for_country(
-    data_country=data_pm2,
-    entities_to_ignore=[],
-    gas_baskets=gas_baskets,
-    filter_dims=None,
-    cat_terminology_out=None,
-    category_conversion=None,
-    sectors_out=None,
-    processing_info_country=country_processing_step1,
-)
+    # ###
+    # Processing
+    # ###
+
+    data_proc_pm2 = process_data_for_country(
+        data_country=data_pm2,
+        entities_to_ignore=[],
+        gas_baskets=gas_baskets,
+        filter_dims=None,
+        cat_terminology_out=None,
+        category_conversion=None,
+        sectors_out=None,
+        processing_info_country=country_processing_step1,
+    )
 
-# ###
-# save processed data to IF and native format
-# ###
+    # ###
+    # save processed data to IF and native format
+    # ###
 
-terminology_proc = coords_terminologies["category"]
+    terminology_proc = coords_terminologies["category"]
 
-data_proc_if = data_proc_pm2.pr.to_interchange_format()
+    data_proc_if = data_proc_pm2.pr.to_interchange_format()
 
-if not output_folder.exists():
-    output_folder.mkdir()
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + terminology_proc), data_proc_if
-)
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
-encoding = {var: compression for var in data_proc_pm2.data_vars}
-data_proc_pm2.pr.to_netcdf(
-    output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
-)
+    encoding = {var: compression for var in data_proc_pm2.data_vars}
+    data_proc_pm2.pr.to_netcdf(
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )
 
-print("Saved processed data.")
+    print("Saved processed data.")

+ 6 - 3
src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR4_from_xlsx.py

@@ -10,11 +10,12 @@ import sys
 
 import pandas as pd
 import primap2 as pm2
-from config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
 from primap2.pm2io._data_reading import filter_data, matches_time_format
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
+from .config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
+
 if __name__ == "__main__":
     # ###
     # configuration
@@ -182,12 +183,14 @@ if __name__ == "__main__":
             nrows=442,
             engine="openpyxl",
         )
-        # drop the columns which are empty and repetition of the metadata for the second block
+        # drop the columns which are empty and repetition of the metadata for the
+        # second block
         df_current = df_current.drop(cols_to_drop, axis=1)
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
         df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
-        # set multi-index. necessary for the stack operation in the conversion to long format
+        # set multi-index. necessary for the stack operation in the conversion to long
+        # format
         df_current = df_current.set_index(index_cols)
         # add unit row using information from entity row and add to index
         df_current = pm2.pm2io.nir_add_unit_information(

+ 6 - 3
src/unfccc_ghg_data/unfccc_reader/Chile/read_CHL_BUR5_from_xlsx.py

@@ -10,11 +10,12 @@ import sys
 
 import pandas as pd
 import primap2 as pm2
-from config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
 from primap2.pm2io._data_reading import filter_data, matches_time_format
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
+from .config_chl_bur4 import aggregate_cats, cat_mapping, filter_remove_IPCC2006
+
 if __name__ == "__main__":
     # ###
     # configuration
@@ -186,12 +187,14 @@ if __name__ == "__main__":
             nrows=442,
             engine="openpyxl",
         )
-        # drop the columns which are empty and repetition of the metadata for the second block
+        # drop the columns which are empty and repetition of the metadata for the
+        # second block
         df_current = df_current.drop(cols_to_drop, axis=1)
         # drop all rows where the index cols (category code and name) are both NaN
         # as without one of them there is no category information
         df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
-        # set multi-index. necessary for the stack operation in the conversion to long format
+        # set multi-index. necessary for the stack operation in the conversion to
+        # long format
         df_current = df_current.set_index(index_cols)
         # add unit row using information from entity row and add to index
         df_current = pm2.pm2io.nir_add_unit_information(

+ 566 - 548
src/unfccc_ghg_data/unfccc_reader/Guinea/read_GIN_BUR1_from_pdf.py

@@ -4,7 +4,12 @@ Read Guinea's BUR1 from pdf
 import camelot
 import pandas as pd
 import primap2 as pm2
-from config_gin_bur1 import (
+
+from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
+from unfccc_ghg_data.helper.functions import process_data_for_country
+from unfccc_ghg_data.helper.functions_temp import find_and_replace_values
+
+from .config_gin_bur1 import (
     coords_cols,
     coords_defaults,
     coords_terminologies,
@@ -22,651 +27,664 @@ from config_gin_bur1 import (
     set_value,
 )
 
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
-from unfccc_ghg_data.helper.functions import process_data_for_country
-from unfccc_ghg_data.helper.functions_temp import find_and_replace_values
+if __name__ == "__main__":
+    # ###
+    # configuration
+    # ###
 
-# ###
-# configuration
-# ###
+    input_folder = downloaded_data_path / "UNFCCC" / "Guinea" / "BUR1"
+    output_folder = extracted_data_path / "UNFCCC" / "Guinea"
+    if not output_folder.exists():
+        output_folder.mkdir()
 
-input_folder = downloaded_data_path / "UNFCCC" / "Guinea" / "BUR1"
-output_folder = extracted_data_path / "UNFCCC" / "Guinea"
-if not output_folder.exists():
-    output_folder.mkdir()
+    pdf_file = "Rapport_IGES-Guinee-BUR1_VF.pdf"
+    output_filename = "GIN_BUR1_2023_"
+    category_column = f"category ({coords_terminologies['category']})"
+    compression = dict(zlib=True, complevel=9)
 
-pdf_file = "Rapport_IGES-Guinee-BUR1_VF.pdf"
-output_filename = "GIN_BUR1_2023_"
-category_column = f"category ({coords_terminologies['category']})"
-compression = dict(zlib=True, complevel=9)
+    def repl(m):  # noqa: D103
+        return m.group("code")
 
+    # ###
+    # 1. Read in main tables
+    # ###
 
-def repl(m):  # noqa: D103
-    return m.group("code")
+    df_main = None
+    for page in inv_conf["pages_to_read"]["main"]:
+        print("-" * 45)
+        print(f"Reading table from page {page}.")
 
+        tables_inventory_original = camelot.read_pdf(
+            str(input_folder / pdf_file),
+            pages=page,
+            table_areas=page_def_templates[page]["area"],
+            columns=page_def_templates[page]["cols"],
+            flavor="stream",
+            split_text=True,
+        )
 
-# ###
-# 1. Read in main tables
-# ###
+        print("Reading complete.")
 
-df_main = None
-for page in inv_conf["pages_to_read"]["main"]:
-    print("-" * 45)
-    print(f"Reading table from page {page}.")
+        df_inventory = tables_inventory_original[0].df.copy()
 
-    tables_inventory_original = camelot.read_pdf(
-        str(input_folder / pdf_file),
-        pages=page,
-        table_areas=page_def_templates[page]["area"],
-        columns=page_def_templates[page]["cols"],
-        flavor="stream",
-        split_text=True,
-    )
+        # set category names (they moved one row up)
+        if page in set_value["main"].keys():
+            for idx, col, value in set_value["main"][page]:
+                df_inventory.loc[idx, col] = value
+        # delete empty row
+        if page in delete_row["main"].keys():
+            for idx in delete_row["main"][page]:
+                df_inventory = df_inventory.drop(index=idx)
 
-    print("Reading complete.")
-
-    df_inventory = tables_inventory_original[0].df.copy()
-
-    # set category names (they moved one row up)
-    if page in set_value["main"].keys():
-        for idx, col, value in set_value["main"][page]:
-            df_inventory.loc[idx, col] = value
-    # delete empty row
-    if page in delete_row["main"].keys():
-        for idx in delete_row["main"][page]:
-            df_inventory = df_inventory.drop(index=idx)
-
-    # add header and unit
-    df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
-    df_inventory = pd.concat(
-        [df_header, df_inventory], axis=0, join="outer"
-    ).reset_index(drop=True)
-    df_inventory = pm2.pm2io.nir_add_unit_information(
-        df_inventory,
-        unit_row=inv_conf["unit_row"],
-        entity_row=inv_conf["entity_row"],
-        regexp_entity=".*",
-        regexp_unit=".*",
-        default_unit="Gg",
-    )
+        # add header and unit
+        df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
+        df_inventory = pd.concat(
+            [df_header, df_inventory], axis=0, join="outer"
+        ).reset_index(drop=True)
+        df_inventory = pm2.pm2io.nir_add_unit_information(
+            df_inventory,
+            unit_row=inv_conf["unit_row"],
+            entity_row=inv_conf["entity_row"],
+            regexp_entity=".*",
+            regexp_unit=".*",
+            default_unit="Gg",
+        )
 
-    print("Added unit information.")
+        print("Added unit information.")
 
-    # set index
-    df_inventory = df_inventory.set_index(inv_conf["index_cols"])
+        # set index
+        df_inventory = df_inventory.set_index(inv_conf["index_cols"])
 
-    # convert to long format
-    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(
-        df_inventory, inv_conf["year"][page], inv_conf["header_long"]
-    )
+        # convert to long format
+        df_inventory_long = pm2.pm2io.nir_convert_df_to_long(
+            df_inventory, inv_conf["year"][page], inv_conf["header_long"]
+        )
 
-    # extract category from tuple
-    df_inventory_long["orig_cat_name"] = df_inventory_long["orig_cat_name"].str[0]
+        # extract category from tuple
+        df_inventory_long["orig_cat_name"] = df_inventory_long["orig_cat_name"].str[0]
 
-    # prep for conversion to PM2 IF and native format
-    df_inventory_long["category"] = df_inventory_long["orig_cat_name"]
+        # prep for conversion to PM2 IF and native format
+        df_inventory_long["category"] = df_inventory_long["orig_cat_name"]
 
-    df_inventory_long["category"] = df_inventory_long["category"].replace(
-        inv_conf["cat_codes_manual"]["main"]
-    )
+        df_inventory_long["category"] = df_inventory_long["category"].replace(
+            inv_conf["cat_codes_manual"]["main"]
+        )
+
+        df_inventory_long["category"] = df_inventory_long["category"].str.replace(
+            ".", ""
+        )
+
+        # regex replacements
+        df_inventory_long["category"] = df_inventory_long["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
 
-    df_inventory_long["category"] = df_inventory_long["category"].str.replace(".", "")
+        df_inventory_long = df_inventory_long.reset_index(drop=True)
+
+        df_inventory_long["data"] = df_inventory_long["data"].str.replace(",", ".")
+        df_inventory_long["data"] = df_inventory_long["data"].str.replace("NE1", "NE")
+
+        # make sure all col headers are str
+        df_inventory_long.columns = df_inventory_long.columns.map(str)
+        df_inventory_long = df_inventory_long.drop(columns=["orig_cat_name"])
+
+        if df_main is None:
+            df_main = df_inventory_long
+        else:
+            df_main = pd.concat(
+                [df_main, df_inventory_long],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    print("Converting to interchange format.")
+    df_all_IF = pm2.pm2io.convert_long_dataframe_if(
+        df_main,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping["main"],
+        filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
 
-    # regex replacements
-    df_inventory_long["category"] = df_inventory_long["category"].str.replace(
-        inv_conf["cat_code_regexp"], repl, regex=True
+    df_all_IF = find_and_replace_values(
+        df=df_all_IF, replace_info=replace_info["main"], category_column=category_column
     )
 
-    df_inventory_long = df_inventory_long.reset_index(drop=True)
+    ### convert to primap2 format ###
+    data_pm2_main = pm2.pm2io.from_interchange_format(df_all_IF)
 
-    df_inventory_long["data"] = df_inventory_long["data"].str.replace(",", ".")
-    df_inventory_long["data"] = df_inventory_long["data"].str.replace("NE1", "NE")
+    # ###
+    # 2. Read energy sector tables
+    # ###
 
-    # make sure all col headers are str
-    df_inventory_long.columns = df_inventory_long.columns.map(str)
-    df_inventory_long = df_inventory_long.drop(columns=["orig_cat_name"])
+    df_energy = None
+    for page in inv_conf["pages_to_read"]["energy"]:
+        print("-" * 45)
+        print(f"Reading table from page {page}.")
 
-    if df_main is None:
-        df_main = df_inventory_long
-    else:
-        df_main = pd.concat(
-            [df_main, df_inventory_long],
+        tables_inventory_original = camelot.read_pdf(
+            str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
+        )
+
+        print("Reading complete.")
+
+        df_energy_year = pd.concat(
+            [tables_inventory_original[0].df[2:], tables_inventory_original[1].df[3:]],
             axis=0,
             join="outer",
         ).reset_index(drop=True)
 
-print("Converting to interchange format.")
-df_all_IF = pm2.pm2io.convert_long_dataframe_if(
-    df_main,
-    coords_cols=coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies,
-    coords_value_mapping=coords_value_mapping["main"],
-    filter_remove=filter_remove,
-    meta_data=meta_data,
-    convert_str=True,
-    time_format="%Y",
-)
+        # TODO This step should be done in pm2.pm2io.convert_long_dataframe_if()
+        for row in delete_rows_by_category["energy"][page]:
+            row_to_delete = df_energy_year.index[df_energy_year[0] == row][0]
+            df_energy_year = df_energy_year.drop(index=row_to_delete)
 
-df_all_IF = find_and_replace_values(
-    df=df_all_IF, replace_info=replace_info["main"], category_column=category_column
-)
+        # add header and unit
+        df_header = pd.DataFrame([inv_conf["header_energy"], inv_conf["unit_energy"]])
 
-### convert to primap2 format ###
-data_pm2_main = pm2.pm2io.from_interchange_format(df_all_IF)
+        df_energy_year = pd.concat(
+            [df_header, df_energy_year], axis=0, join="outer"
+        ).reset_index(drop=True)
 
-# ###
-# 2. Read energy sector tables
-# ###
+        df_energy_year = pm2.pm2io.nir_add_unit_information(
+            df_energy_year,
+            unit_row=inv_conf["unit_row"],
+            entity_row=inv_conf["entity_row"],
+            regexp_entity=".*",
+            regexp_unit=".*",
+            default_unit="Gg",
+        )
 
-df_energy = None
-for page in inv_conf["pages_to_read"]["energy"]:
-    print("-" * 45)
-    print(f"Reading table from page {page}.")
+        print("Added unit information.")
+        # set index
+        df_energy_year = df_energy_year.set_index(inv_conf["index_cols"])
 
-    tables_inventory_original = camelot.read_pdf(
-        str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
-    )
+        # convert to long format
+        df_energy_year_long = pm2.pm2io.nir_convert_df_to_long(
+            df_energy_year, inv_conf["year"][page], inv_conf["header_long"]
+        )
 
-    print("Reading complete.")
-
-    df_energy_year = pd.concat(
-        [tables_inventory_original[0].df[2:], tables_inventory_original[1].df[3:]],
-        axis=0,
-        join="outer",
-    ).reset_index(drop=True)
-
-    # TODO This step should be done in pm2.pm2io.convert_long_dataframe_if()
-    for row in delete_rows_by_category["energy"][page]:
-        row_to_delete = df_energy_year.index[df_energy_year[0] == row][0]
-        df_energy_year = df_energy_year.drop(index=row_to_delete)
-
-    # add header and unit
-    df_header = pd.DataFrame([inv_conf["header_energy"], inv_conf["unit_energy"]])
-
-    df_energy_year = pd.concat(
-        [df_header, df_energy_year], axis=0, join="outer"
-    ).reset_index(drop=True)
-
-    df_energy_year = pm2.pm2io.nir_add_unit_information(
-        df_energy_year,
-        unit_row=inv_conf["unit_row"],
-        entity_row=inv_conf["entity_row"],
-        regexp_entity=".*",
-        regexp_unit=".*",
-        default_unit="Gg",
-    )
+        # extract from tuple
+        df_energy_year_long["orig_cat_name"] = df_energy_year_long["orig_cat_name"].str[
+            0
+        ]
 
-    print("Added unit information.")
-    # set index
-    df_energy_year = df_energy_year.set_index(inv_conf["index_cols"])
+        # prep for conversion to PM2 IF and native format
+        # make a copy of the categories row
+        df_energy_year_long["category"] = df_energy_year_long["orig_cat_name"]
 
-    # convert to long format
-    df_energy_year_long = pm2.pm2io.nir_convert_df_to_long(
-        df_energy_year, inv_conf["year"][page], inv_conf["header_long"]
-    )
+        # replace cat names by codes in col "category"
+        # first the manual replacements
+        df_energy_year_long["category"] = df_energy_year_long["category"].str.replace(
+            "\n", ""
+        )
+        df_energy_year_long["category"] = df_energy_year_long["category"].replace(
+            inv_conf["cat_codes_manual"]["energy"]
+        )
 
-    # extract from tuple
-    df_energy_year_long["orig_cat_name"] = df_energy_year_long["orig_cat_name"].str[0]
+        df_energy_year_long["category"] = df_energy_year_long["category"].str.replace(
+            ".", ""
+        )
 
-    # prep for conversion to PM2 IF and native format
-    # make a copy of the categories row
-    df_energy_year_long["category"] = df_energy_year_long["orig_cat_name"]
+        # then the regex replacements
+        df_energy_year_long["category"] = df_energy_year_long["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
 
-    # replace cat names by codes in col "category"
-    # first the manual replacements
-    df_energy_year_long["category"] = df_energy_year_long["category"].str.replace(
-        "\n", ""
-    )
-    df_energy_year_long["category"] = df_energy_year_long["category"].replace(
-        inv_conf["cat_codes_manual"]["energy"]
-    )
+        df_energy_year_long = df_energy_year_long.reset_index(drop=True)
 
-    df_energy_year_long["category"] = df_energy_year_long["category"].str.replace(
-        ".", ""
-    )
+        df_energy_year_long["data"] = df_energy_year_long["data"].str.replace(",", ".")
+        df_energy_year_long["data"] = df_energy_year_long["data"].str.replace(
+            "NE1", "NE"
+        )
 
-    # then the regex replacements
-    df_energy_year_long["category"] = df_energy_year_long["category"].str.replace(
-        inv_conf["cat_code_regexp"], repl, regex=True
+        # make sure all col headers are str
+        df_energy_year_long.columns = df_energy_year_long.columns.map(str)
+        df_energy_year_long = df_energy_year_long.drop(columns=["orig_cat_name"])
+
+        if df_energy is None:
+            df_energy = df_energy_year_long
+        else:
+            df_energy = pd.concat(
+                [df_energy, df_energy_year_long],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    print("Converting to interchange format.")
+    df_energy_IF = pm2.pm2io.convert_long_dataframe_if(
+        df_energy,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping["energy"],
+        filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
     )
 
-    df_energy_year_long = df_energy_year_long.reset_index(drop=True)
+    ### convert to primap2 format ###
+    data_pm2_energy = pm2.pm2io.from_interchange_format(df_energy_IF)
 
-    df_energy_year_long["data"] = df_energy_year_long["data"].str.replace(",", ".")
-    df_energy_year_long["data"] = df_energy_year_long["data"].str.replace("NE1", "NE")
+    # ###
+    # 3. Read in afolu table
+    # ###
 
-    # make sure all col headers are str
-    df_energy_year_long.columns = df_energy_year_long.columns.map(str)
-    df_energy_year_long = df_energy_year_long.drop(columns=["orig_cat_name"])
+    df_afolu = None
+    for page in inv_conf["pages_to_read"]["afolu"]:
+        print("-" * 45)
+        print(f"Reading table from page {page}.")
 
-    if df_energy is None:
-        df_energy = df_energy_year_long
-    else:
-        df_energy = pd.concat(
-            [df_energy, df_energy_year_long],
-            axis=0,
-            join="outer",
+        tables_inventory_original = camelot.read_pdf(
+            str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
+        )
+        print("Reading complete.")
+
+        if page == "127":
+            # table on page 127 has one extra row at the top
+            # and one extra category 3.A.1.j
+            df_afolu_year = tables_inventory_original[0].df[3:]
+            # 3.A.1.a.i to 3.A.1.j exist twice.
+            # Rename duplicate categories in tables.
+            for index, category_name in replace_categories["afolu"]["127"]:
+                df_afolu_year.loc[index, 0] = category_name
+        else:
+            # cut first two lines
+            df_afolu_year = tables_inventory_original[0].df[2:]
+            # On pages 124-126 the wrong categories are slightly different
+            for index, category_name in replace_categories["afolu"]["124-126"]:
+                df_afolu_year.loc[index, 0] = category_name
+
+        # add header and unit
+        df_header = pd.DataFrame([inv_conf["header_afolu"], inv_conf["unit_afolu"]])
+
+        df_afolu_year = pd.concat(
+            [df_header, df_afolu_year], axis=0, join="outer"
         ).reset_index(drop=True)
 
-print("Converting to interchange format.")
-df_energy_IF = pm2.pm2io.convert_long_dataframe_if(
-    df_energy,
-    coords_cols=coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies,
-    coords_value_mapping=coords_value_mapping["energy"],
-    filter_remove=filter_remove,
-    meta_data=meta_data,
-    convert_str=True,
-    time_format="%Y",
-)
-
-### convert to primap2 format ###
-data_pm2_energy = pm2.pm2io.from_interchange_format(df_energy_IF)
-
-# ###
-# 3. Read in afolu table
-# ###
+        df_afolu_year = pm2.pm2io.nir_add_unit_information(
+            df_afolu_year,
+            unit_row=inv_conf["unit_row"],
+            entity_row=inv_conf["entity_row"],
+            regexp_entity=".*",
+            regexp_unit=".*",
+            default_unit="Gg",
+        )
 
-df_afolu = None
-for page in inv_conf["pages_to_read"]["afolu"]:
-    print("-" * 45)
-    print(f"Reading table from page {page}.")
+        print("Added unit information.")
 
-    tables_inventory_original = camelot.read_pdf(
-        str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
-    )
-    print("Reading complete.")
-
-    if page == "127":
-        # table on page 127 has one extra row at the top
-        # and one extra category 3.A.1.j
-        df_afolu_year = tables_inventory_original[0].df[3:]
-        # 3.A.1.a.i to 3.A.1.j exist twice.
-        # Rename duplicate categories in tables.
-        for index, category_name in replace_categories["afolu"]["127"]:
-            df_afolu_year.loc[index, 0] = category_name
-    else:
-        # cut first two lines
-        df_afolu_year = tables_inventory_original[0].df[2:]
-        # On pages 124-126 the wrong categories are slightly different
-        for index, category_name in replace_categories["afolu"]["124-126"]:
-            df_afolu_year.loc[index, 0] = category_name
-
-    # add header and unit
-    df_header = pd.DataFrame([inv_conf["header_afolu"], inv_conf["unit_afolu"]])
-
-    df_afolu_year = pd.concat(
-        [df_header, df_afolu_year], axis=0, join="outer"
-    ).reset_index(drop=True)
-
-    df_afolu_year = pm2.pm2io.nir_add_unit_information(
-        df_afolu_year,
-        unit_row=inv_conf["unit_row"],
-        entity_row=inv_conf["entity_row"],
-        regexp_entity=".*",
-        regexp_unit=".*",
-        default_unit="Gg",
-    )
+        # set index
+        df_afolu_year = df_afolu_year.set_index(inv_conf["index_cols"])
 
-    print("Added unit information.")
-
-    # set index
-    df_afolu_year = df_afolu_year.set_index(inv_conf["index_cols"])
+        # convert to long format
+        df_afolu_year_long = pm2.pm2io.nir_convert_df_to_long(
+            df_afolu_year, inv_conf["year"][page], inv_conf["header_long"]
+        )
 
-    # convert to long format
-    df_afolu_year_long = pm2.pm2io.nir_convert_df_to_long(
-        df_afolu_year, inv_conf["year"][page], inv_conf["header_long"]
-    )
+        df_afolu_year_long["orig_cat_name"] = df_afolu_year_long["orig_cat_name"].str[0]
 
-    df_afolu_year_long["orig_cat_name"] = df_afolu_year_long["orig_cat_name"].str[0]
+        # prep for conversion to PM2 IF and native format
+        # make a copy of the categories row
+        df_afolu_year_long["category"] = df_afolu_year_long["orig_cat_name"]
 
-    # prep for conversion to PM2 IF and native format
-    # make a copy of the categories row
-    df_afolu_year_long["category"] = df_afolu_year_long["orig_cat_name"]
+        # regex replacements
+        df_afolu_year_long["category"] = df_afolu_year_long["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
 
-    # regex replacements
-    df_afolu_year_long["category"] = df_afolu_year_long["category"].str.replace(
-        inv_conf["cat_code_regexp"], repl, regex=True
+        df_afolu_year_long = df_afolu_year_long.reset_index(drop=True)
+
+        df_afolu_year_long["data"] = df_afolu_year_long["data"].str.replace(",", ".")
+        df_afolu_year_long["data"] = df_afolu_year_long["data"].str.replace("NE1", "NE")
+
+        # make sure all col headers are str
+        df_afolu_year_long.columns = df_afolu_year_long.columns.map(str)
+        df_afolu_year_long = df_afolu_year_long.drop(columns=["orig_cat_name"])
+
+        if df_afolu is None:
+            df_afolu = df_afolu_year_long
+        else:
+            df_afolu = pd.concat(
+                [df_afolu, df_afolu_year_long],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    print("Converting to interchange format.")
+    df_afolu_IF = pm2.pm2io.convert_long_dataframe_if(
+        df_afolu,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping["afolu"],
+        filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
     )
 
-    df_afolu_year_long = df_afolu_year_long.reset_index(drop=True)
-
-    df_afolu_year_long["data"] = df_afolu_year_long["data"].str.replace(",", ".")
-    df_afolu_year_long["data"] = df_afolu_year_long["data"].str.replace("NE1", "NE")
-
-    # make sure all col headers are str
-    df_afolu_year_long.columns = df_afolu_year_long.columns.map(str)
-    df_afolu_year_long = df_afolu_year_long.drop(columns=["orig_cat_name"])
-
-    if df_afolu is None:
-        df_afolu = df_afolu_year_long
-    else:
-        df_afolu = pd.concat(
-            [df_afolu, df_afolu_year_long],
-            axis=0,
-            join="outer",
-        ).reset_index(drop=True)
-
-print("Converting to interchange format.")
-df_afolu_IF = pm2.pm2io.convert_long_dataframe_if(
-    df_afolu,
-    coords_cols=coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies,
-    coords_value_mapping=coords_value_mapping["afolu"],
-    filter_remove=filter_remove,
-    meta_data=meta_data,
-    convert_str=True,
-    time_format="%Y",
-)
-
-### convert to primap2 format ###
-data_pm2_afolu = pm2.pm2io.from_interchange_format(df_afolu_IF)
+    ### convert to primap2 format ###
+    data_pm2_afolu = pm2.pm2io.from_interchange_format(df_afolu_IF)
 
-# ###
-# 4. Read in Waste tables - pages 128, 130
-# ###
+    # ###
+    # 4. Read in Waste tables - pages 128, 130
+    # ###
 
-# There are three tables for three years on page 128
-# and another table for the last year on page 130
-
-# read the first three tables
-page = inv_conf["pages_to_read"]["waste"][0]
-tables_inventory_original_128 = camelot.read_pdf(
-    str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
-)
-
-# read last table
-page = inv_conf["pages_to_read"]["waste"][1]
-tables_inventory_original_130 = camelot.read_pdf(
-    str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
-)
+    # There are three tables for three years on page 128
+    # and another table for the last year on page 130
 
-# combine in a dict
-df_waste_years = {
-    "1990": tables_inventory_original_128[0].df,
-    "2000": tables_inventory_original_128[1].df,
-    "2010": tables_inventory_original_128[2].df,
-    "2019": tables_inventory_original_130[0].df,
-}
-
-df_waste = None
-for year in df_waste_years.keys():
-    print("-" * 45)
-    print(f"Processing table for {year}.")
-
-    df_waste_year = df_waste_years[year][2:]
-
-    # add header and unit
-    df_header = pd.DataFrame([inv_conf["header_waste"], inv_conf["unit_waste"]])
-
-    df_waste_year = pd.concat(
-        [df_header, df_waste_year], axis=0, join="outer"
-    ).reset_index(drop=True)
-
-    df_waste_year = pm2.pm2io.nir_add_unit_information(
-        df_waste_year,
-        unit_row=inv_conf["unit_row"],
-        entity_row=inv_conf["entity_row"],
-        regexp_entity=".*",
-        regexp_unit=".*",
-        default_unit="Gg",
+    # read the first three tables
+    page = inv_conf["pages_to_read"]["waste"][0]
+    tables_inventory_original_128 = camelot.read_pdf(
+        str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
     )
 
-    print("Added unit information.")
-
-    # set index
-    df_waste_year = df_waste_year.set_index(inv_conf["index_cols"])
-
-    # convert to long format
-    df_waste_year_long = pm2.pm2io.nir_convert_df_to_long(
-        df_waste_year, year, inv_conf["header_long"]
+    # read last table
+    page = inv_conf["pages_to_read"]["waste"][1]
+    tables_inventory_original_130 = camelot.read_pdf(
+        str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
     )
 
-    df_waste_year_long["orig_cat_name"] = df_waste_year_long["orig_cat_name"].str[0]
+    # combine in a dict
+    df_waste_years = {
+        "1990": tables_inventory_original_128[0].df,
+        "2000": tables_inventory_original_128[1].df,
+        "2010": tables_inventory_original_128[2].df,
+        "2019": tables_inventory_original_130[0].df,
+    }
 
-    # prep for conversion to PM2 IF and native format
-    # make a copy of the categories row
-    df_waste_year_long["category"] = df_waste_year_long["orig_cat_name"]
+    df_waste = None
+    for year in df_waste_years.keys():
+        print("-" * 45)
+        print(f"Processing table for {year}.")
 
-    # regex replacements
-    df_waste_year_long["category"] = df_waste_year_long["category"].str.replace(
-        inv_conf["cat_code_regexp"], repl, regex=True
-    )
+        df_waste_year = df_waste_years[year][2:]
 
-    df_waste_year_long = df_waste_year_long.reset_index(drop=True)
+        # add header and unit
+        df_header = pd.DataFrame([inv_conf["header_waste"], inv_conf["unit_waste"]])
 
-    df_waste_year_long["category"] = df_waste_year_long["category"].str.replace(".", "")
-    df_waste_year_long["data"] = df_waste_year_long["data"].str.replace(",", ".")
-    df_waste_year_long["data"] = df_waste_year_long["data"].str.replace("NE1", "NE")
+        df_waste_year = pd.concat(
+            [df_header, df_waste_year], axis=0, join="outer"
+        ).reset_index(drop=True)
 
-    # make sure all col headers are str
-    df_waste_year_long.columns = df_waste_year_long.columns.map(str)
-    df_waste_year_long = df_waste_year_long.drop(columns=["orig_cat_name"])
+        df_waste_year = pm2.pm2io.nir_add_unit_information(
+            df_waste_year,
+            unit_row=inv_conf["unit_row"],
+            entity_row=inv_conf["entity_row"],
+            regexp_entity=".*",
+            regexp_unit=".*",
+            default_unit="Gg",
+        )
 
-    if df_waste is None:
-        df_waste = df_waste_year_long
-    else:
-        df_waste = pd.concat(
-            [df_waste, df_waste_year_long],
-            axis=0,
-            join="outer",
-        ).reset_index(drop=True)
+        print("Added unit information.")
 
-print("Converting to interchange format.")
-df_waste_IF = pm2.pm2io.convert_long_dataframe_if(
-    df_waste,
-    coords_cols=coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies,
-    coords_value_mapping=coords_value_mapping["waste"],
-    filter_remove=filter_remove,
-    meta_data=meta_data,
-    convert_str=True,
-    time_format="%Y",
-)
+        # set index
+        df_waste_year = df_waste_year.set_index(inv_conf["index_cols"])
 
-### convert to primap2 format ###
-data_pm2_waste = pm2.pm2io.from_interchange_format(df_waste_IF)
+        # convert to long format
+        df_waste_year_long = pm2.pm2io.nir_convert_df_to_long(
+            df_waste_year, year, inv_conf["header_long"]
+        )
 
-# ###
-# 5. Read in trend tables - pages 131 - 137
-# ###
+        df_waste_year_long["orig_cat_name"] = df_waste_year_long["orig_cat_name"].str[0]
 
-df_trend = None
-pages = inv_conf["pages_to_read"]["trend"]
-entities = inv_conf["entity_for_page"]["trend"]
+        # prep for conversion to PM2 IF and native format
+        # make a copy of the categories row
+        df_waste_year_long["category"] = df_waste_year_long["orig_cat_name"]
 
-# for this set of tables every page is a different entity
-for page, entity in zip(pages, entities):
-    print("-" * 45)
-    print(f"Reading table for page {page} and entity {entity}.")
+        # regex replacements
+        df_waste_year_long["category"] = df_waste_year_long["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
 
-    # First table must be read in with flavor="stream", as
-    # flavor="lattice" raises an error. Maybe camelot issue
-    # see https://github.com/atlanhq/camelot/issues/306,
-    # or because characters in first row almost touch
-    # the table grid.
-    if page == "131":
-        tables_inventory_original = camelot.read_pdf(
-            str(input_folder / pdf_file),
-            pages=page,
-            table_areas=page_def_templates[page]["area"],
-            columns=page_def_templates[page]["cols"],
-            flavor="stream",
-            split_text=True,
+        df_waste_year_long = df_waste_year_long.reset_index(drop=True)
+
+        df_waste_year_long["category"] = df_waste_year_long["category"].str.replace(
+            ".", ""
         )
+        df_waste_year_long["data"] = df_waste_year_long["data"].str.replace(",", ".")
+        df_waste_year_long["data"] = df_waste_year_long["data"].str.replace("NE1", "NE")
+
+        # make sure all col headers are str
+        df_waste_year_long.columns = df_waste_year_long.columns.map(str)
+        df_waste_year_long = df_waste_year_long.drop(columns=["orig_cat_name"])
+
+        if df_waste is None:
+            df_waste = df_waste_year_long
+        else:
+            df_waste = pd.concat(
+                [df_waste, df_waste_year_long],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    print("Converting to interchange format.")
+    df_waste_IF = pm2.pm2io.convert_long_dataframe_if(
+        df_waste,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping["waste"],
+        filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
+    )
 
-        df_trend_entity = tables_inventory_original[0].df[1:]
+    ### convert to primap2 format ###
+    data_pm2_waste = pm2.pm2io.from_interchange_format(df_waste_IF)
+
+    # ###
+    # 5. Read in trend tables - pages 131 - 137
+    # ###
+
+    df_trend = None
+    pages = inv_conf["pages_to_read"]["trend"]
+    entities = inv_conf["entity_for_page"]["trend"]
+
+    # for this set of tables every page is a different entity
+    for page, entity in zip(pages, entities):
+        print("-" * 45)
+        print(f"Reading table for page {page} and entity {entity}.")
+
+        # First table must be read in with flavor="stream", as
+        # flavor="lattice" raises an error. Maybe camelot issue
+        # see https://github.com/atlanhq/camelot/issues/306,
+        # or because characters in first row almost touch
+        # the table grid.
+        if page == "131":
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                table_areas=page_def_templates[page]["area"],
+                columns=page_def_templates[page]["cols"],
+                flavor="stream",
+                split_text=True,
+            )
+
+            df_trend_entity = tables_inventory_original[0].df[1:]
+
+        else:
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                flavor="lattice",
+                split_text=True,
+            )
+            df_trend_entity = tables_inventory_original[0].df[3:]
+
+        print("Reading complete.")
+
+        if page in delete_rows_by_category["trend"].keys():
+            for category in delete_rows_by_category["trend"][page]:
+                row_to_delete = df_trend_entity.index[df_trend_entity[0] == category][0]
+                df_trend_entity = df_trend_entity.drop(index=row_to_delete)
+
+        df_trend_entity.columns = inv_conf["header_trend"]
+
+        df_trend_entity = df_trend_entity.copy()
+
+        # unit is always Gg
+        df_trend_entity.loc[:, "unit"] = "Gg"
+
+        # only one entity per table
+        df_trend_entity.loc[:, "entity"] = entity
+
+        df_trend_entity.loc[:, "category"] = df_trend_entity["orig_cat_name"]
+
+        df_trend_entity["category"] = df_trend_entity["category"].replace(
+            inv_conf["cat_codes_manual"]["trend"]
+        )
 
-    else:
-        tables_inventory_original = camelot.read_pdf(
-            str(input_folder / pdf_file), pages=page, flavor="lattice", split_text=True
+        df_trend_entity.loc[:, "category"] = df_trend_entity["category"].str.replace(
+            ".", ""
+        )
+        df_trend_entity.loc[:, "category"] = df_trend_entity["category"].str.replace(
+            "\n", ""
         )
-        df_trend_entity = tables_inventory_original[0].df[3:]
 
-    print("Reading complete.")
+        df_trend_entity.loc[:, "category"] = df_trend_entity["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
 
-    if page in delete_rows_by_category["trend"].keys():
-        for category in delete_rows_by_category["trend"][page]:
-            row_to_delete = df_trend_entity.index[df_trend_entity[0] == category][0]
-            df_trend_entity = df_trend_entity.drop(index=row_to_delete)
+        df_trend_entity = df_trend_entity.reset_index(drop=True)
 
-    df_trend_entity.columns = inv_conf["header_trend"]
+        print("Created category codes.")
 
-    df_trend_entity = df_trend_entity.copy()
+        for year in inv_conf["header_trend"][1:]:
+            df_trend_entity.loc[:, year] = df_trend_entity[year].str.replace(",", ".")
+            df_trend_entity.loc[:, year] = df_trend_entity[year].str.replace(
+                "NE1", "NE"
+            )
 
-    # unit is always Gg
-    df_trend_entity.loc[:, "unit"] = "Gg"
+        # make sure all col headers are str
+        df_trend_entity.columns = df_trend_entity.columns.map(str)
 
-    # only one entity per table
-    df_trend_entity.loc[:, "entity"] = entity
+        df_trend_entity = df_trend_entity.drop(columns=["orig_cat_name"])
 
-    df_trend_entity.loc[:, "category"] = df_trend_entity["orig_cat_name"]
+        # TODO better to use pm2.pm2io.convert_wide_dataframe_if
+        df_trend_entity_long = pd.wide_to_long(
+            df_trend_entity, stubnames="data", i="category", j="time"
+        )
 
-    df_trend_entity["category"] = df_trend_entity["category"].replace(
-        inv_conf["cat_codes_manual"]["trend"]
+        print("Converted to long format.")
+
+        df_trend_entity_long = df_trend_entity_long.reset_index()
+
+        if df_trend is None:
+            df_trend = df_trend_entity_long
+        else:
+            df_trend = pd.concat(
+                [df_trend, df_trend_entity_long],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
+
+    print("Converting to interchange format.")
+
+    df_trend_IF = pm2.pm2io.convert_long_dataframe_if(
+        df_trend,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping["trend"],
+        filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
     )
 
-    df_trend_entity.loc[:, "category"] = df_trend_entity["category"].str.replace(
-        ".", ""
-    )
-    df_trend_entity.loc[:, "category"] = df_trend_entity["category"].str.replace(
-        "\n", ""
+    df_trend_IF = find_and_replace_values(
+        df=df_trend_IF,
+        replace_info=replace_info["trend"],
+        category_column=category_column,
     )
 
-    df_trend_entity.loc[:, "category"] = df_trend_entity["category"].str.replace(
-        inv_conf["cat_code_regexp"], repl, regex=True
+    ### convert to primap2 format ###
+    data_pm2_trend = pm2.pm2io.from_interchange_format(df_trend_IF)
+
+    # ###
+    # Combine tables
+    # ###
+
+    # merge main and energy
+    # There are discrepancies larger than 0.86 for area category 1.A.2, entity NMVOC,
+    # years 1990, 2000, 2010, 2019
+    # It is assumed the main table has the correct values.
+    print("Merging main and energy table.")
+    data_pm2 = data_pm2_main.pr.merge(data_pm2_energy, tolerance=1)
+
+    # merge afolu
+    print("Merging afolu table.")
+    data_pm2 = data_pm2.pr.merge(data_pm2_afolu, tolerance=0.11)
+
+    # merge waste
+    # increasing tolerance to merge values for 4.C, 1990, N2O - 0.003 in sector table,
+    # 0.0034 in main table
+    print("Merging waste table.")
+    data_pm2 = data_pm2.pr.merge(data_pm2_waste, tolerance=0.15)
+
+    # merge trend
+    print("Merging trend table.")
+    data_pm2 = data_pm2.pr.merge(data_pm2_trend, tolerance=0.11)
+
+    # convert back to IF to have units in the fixed format
+    # ( per year / per a / per annum)
+    data_if = data_pm2.pr.to_interchange_format()
+
+    # ###
+    # Save raw data to IF and native format.
+    # ###
+
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
     )
 
-    df_trend_entity = df_trend_entity.reset_index(drop=True)
-
-    print("Created category codes.")
-
-    for year in inv_conf["header_trend"][1:]:
-        df_trend_entity.loc[:, year] = df_trend_entity[year].str.replace(",", ".")
-        df_trend_entity.loc[:, year] = df_trend_entity[year].str.replace("NE1", "NE")
-
-    # make sure all col headers are str
-    df_trend_entity.columns = df_trend_entity.columns.map(str)
-
-    df_trend_entity = df_trend_entity.drop(columns=["orig_cat_name"])
-
-    # TODO better to use pm2.pm2io.convert_wide_dataframe_if
-    df_trend_entity_long = pd.wide_to_long(
-        df_trend_entity, stubnames="data", i="category", j="time"
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
     )
 
-    print("Converted to long format.")
-
-    df_trend_entity_long = df_trend_entity_long.reset_index()
-
-    if df_trend is None:
-        df_trend = df_trend_entity_long
-    else:
-        df_trend = pd.concat(
-            [df_trend, df_trend_entity_long],
-            axis=0,
-            join="outer",
-        ).reset_index(drop=True)
-
-print("Converting to interchange format.")
-
-df_trend_IF = pm2.pm2io.convert_long_dataframe_if(
-    df_trend,
-    coords_cols=coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies,
-    coords_value_mapping=coords_value_mapping["trend"],
-    filter_remove=filter_remove,
-    meta_data=meta_data,
-    convert_str=True,
-    time_format="%Y",
-)
-
-df_trend_IF = find_and_replace_values(
-    df=df_trend_IF, replace_info=replace_info["trend"], category_column=category_column
-)
-
-### convert to primap2 format ###
-data_pm2_trend = pm2.pm2io.from_interchange_format(df_trend_IF)
-
-# ###
-# Combine tables
-# ###
-
-# merge main and energy
-# There are discrepancies larger than 0.86 for area category 1.A.2, entity NMVOC,
-# years 1990, 2000, 2010, 2019
-# It is assumed the main table has the correct values.
-print("Merging main and energy table.")
-data_pm2 = data_pm2_main.pr.merge(data_pm2_energy, tolerance=1)
-
-# merge afolu
-print("Merging afolu table.")
-data_pm2 = data_pm2.pr.merge(data_pm2_afolu, tolerance=0.11)
-
-# merge waste
-# increasing tolerance to merge values for 4.C, 1990, N2O - 0.003 in sector table, 0.0034 in main table
-print("Merging waste table.")
-data_pm2 = data_pm2.pr.merge(data_pm2_waste, tolerance=0.15)
-
-# merge trend
-print("Merging trend table.")
-data_pm2 = data_pm2.pr.merge(data_pm2_trend, tolerance=0.11)
-
-# convert back to IF to have units in the fixed format ( per year / per a / per annum)
-data_if = data_pm2.pr.to_interchange_format()
-
-# ###
-# Save raw data to IF and native format.
-# ###
-
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-    data_if,
-)
-
-encoding = {var: compression for var in data_pm2.data_vars}
-data_pm2.pr.to_netcdf(
-    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-    encoding=encoding,
-)
-
-# ###
-# Processing
-# ###
-
-data_proc_pm2 = process_data_for_country(
-    data_country=data_pm2,
-    entities_to_ignore=[],
-    gas_baskets=gas_baskets,
-    filter_dims=None,  # leaving this explicit for now
-    cat_terminology_out=None,
-    category_conversion=None,
-    sectors_out=None,
-    processing_info_country=country_processing_step1,
-)
+    # ###
+    # Processing
+    # ###
+
+    data_proc_pm2 = process_data_for_country(
+        data_country=data_pm2,
+        entities_to_ignore=[],
+        gas_baskets=gas_baskets,
+        filter_dims=None,  # leaving this explicit for now
+        cat_terminology_out=None,
+        category_conversion=None,
+        sectors_out=None,
+        processing_info_country=country_processing_step1,
+    )
 
-# ###
-# save processed data to IF and native format
-# ###
+    # ###
+    # save processed data to IF and native format
+    # ###
 
-terminology_proc = coords_terminologies["category"]
+    terminology_proc = coords_terminologies["category"]
 
-data_proc_if = data_proc_pm2.pr.to_interchange_format()
+    data_proc_if = data_proc_pm2.pr.to_interchange_format()
 
-if not output_folder.exists():
-    output_folder.mkdir()
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + terminology_proc), data_proc_if
-)
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
-encoding = {var: compression for var in data_proc_pm2.data_vars}
-data_proc_pm2.pr.to_netcdf(
-    output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
-)
+    encoding = {var: compression for var in data_proc_pm2.data_vars}
+    data_proc_pm2.pr.to_netcdf(
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )
 
-print("Saved processed data.")
+    print("Saved processed data.")

+ 7 - 7
src/unfccc_ghg_data/unfccc_reader/Israel/read_ISR_BUR2_from_pdf.py

@@ -16,8 +16,14 @@ import camelot
 import pandas as pd
 import primap2 as pm2
 
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)
+
 # configuration import
-from config_isr_bur2 import (
+from .config_isr_bur2 import (
     basket_copy,
     cat_conversion,
     cats_to_agg,
@@ -37,12 +43,6 @@ from config_isr_bur2 import (
     trend_table_def,
 )
 
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    process_data_for_country,
-)
-
 if __name__ == "__main__":
     ### general configuration
     input_folder = downloaded_data_path / "UNFCCC" / "Israel" / "BUR2"

+ 10 - 9
src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR3_from_pdf.py

@@ -9,7 +9,16 @@ Data are read from pdf using camelot
 
 import camelot
 import primap2 as pm2
-from config_mys_bur3 import (
+from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    fix_rows,
+    process_data_for_country,
+)
+
+from .config_mys_bur3 import (
     cat_code_regexp,
     cat_codes_manual,
     cat_names_fix,
@@ -27,14 +36,6 @@ from config_mys_bur3 import (
     terminology_proc,
     values_replacement,
 )
-from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
-
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    fix_rows,
-    process_data_for_country,
-)
 
 if __name__ == "__main__":
     # ###

+ 10 - 9
src/unfccc_ghg_data/unfccc_reader/Malaysia/read_MYS_BUR4_from_pdf.py

@@ -10,7 +10,16 @@ Code is mostly identical to BUR3
 
 import camelot
 import primap2 as pm2
-from config_mys_bur4 import (
+from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    fix_rows,
+    process_data_for_country,
+)
+
+from .config_mys_bur4 import (
     cat_code_regexp,
     cat_codes_manual,
     cat_names_fix,
@@ -28,14 +37,6 @@ from config_mys_bur4 import (
     terminology_proc,
     values_replacement,
 )
-from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
-
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    fix_rows,
-    process_data_for_country,
-)
 
 if __name__ == "__main__":
     # ###

+ 5 - 3
src/unfccc_ghg_data/unfccc_reader/Mexico/read_MEX_BUR3_from_pdf.py

@@ -9,10 +9,11 @@ Data are read from pdf using camelot
 import camelot
 import pandas as pd
 import primap2 as pm2
-from config_mex_bur3 import fix_rows, page_defs
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
+from .config_mex_bur3 import fix_rows, page_defs
+
 if __name__ == "__main__":
     # ###
     # configuration
@@ -49,7 +50,7 @@ if __name__ == "__main__":
     # manual category codes
     cat_codes_manual = {
         "Todas las emisiones y las absorciones nacionales": "0",
-        "Todas las emisiones (sin [3B] Tierra ni [3D1] Productos de madera recolectada": "M0EL",
+        "Todas las emisiones (sin [3B] Tierra ni [3D1] Productos de madera recolectada": "M0EL",  # noqa: E501
         "2F6 Otras aplicaciones": "2F6",
     }
 
@@ -120,7 +121,8 @@ if __name__ == "__main__":
 
         # fix rows
         for n_rows in page_def["rows_to_fix"].keys():
-            # replace line breaks, long hyphens, double, and triple spaces in category names
+            # replace line breaks, long hyphens, double, and triple spaces in category
+            # names
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")

+ 273 - 270
src/unfccc_ghg_data/unfccc_reader/Mongolia/read_MNG_BUR2_from_pdf.py

@@ -4,7 +4,15 @@ Read Mongolia's BUR2 from pdf
 import camelot
 import pandas as pd
 import primap2 as pm2
-from config_mng_bur2 import (
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    fix_rows,
+    process_data_for_country,
+)
+
+from .config_mng_bur2 import (
     coords_cols,
     coords_defaults,
     coords_terminologies,
@@ -18,320 +26,315 @@ from config_mng_bur2 import (
     meta_data,
 )
 
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    fix_rows,
-    process_data_for_country,
-)
+if __name__ == "__main__":
+    # ###
+    # configuration
+    # ###
+
+    input_folder = downloaded_data_path / "UNFCCC" / "Mongolia" / "BUR2"
+    output_folder = extracted_data_path / "UNFCCC" / "Mongolia"
+
+    if not output_folder.exists():
+        output_folder.mkdir()
+
+    pdf_file = "20231112_NIR_MGL.pdf"
+    output_filename = "MNG_BUR2_2023_"
+    category_column = f"category ({coords_terminologies['category']})"
+    compression = dict(zlib=True, complevel=9)
+
+    def repl(m):  # noqa: D103
+        return m.group("code")
+
+    # ###
+    # 1. Read in main tables
+    # ###
+
+    df_main = None
+    for year in inv_conf_per_year.keys():
+        print("-" * 60)
+        print(f"Reading year {year}.")
+        print("-" * 60)
+        df_year = None
+        for page in inv_conf_per_year[year]["page_defs"].keys():
+            print(f"Reading table from page {page}.")
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                table_areas=inv_conf_per_year[year]["page_defs"][page]["area"],
+                columns=inv_conf_per_year[year]["page_defs"][page]["cols"],
+                flavor="stream",
+                split_text=True,
+            )
+            print("Reading complete.")
+
+            df_page = tables_inventory_original[0].df
+
+            if df_year is None:
+                df_year = df_page
+            else:
+                df_year = pd.concat(
+                    [df_year, df_page],
+                    axis=0,
+                    join="outer",
+                ).reset_index(drop=True)
+
+        print(f"Concatenating all tables for {year}.")
+
+        # fix content that spreads across multiple rows
+        if "rows_to_fix" in inv_conf_per_year[year]:
+            for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
+                print(f"Merge content for {n_rows=}")
+                df_year = fix_rows(
+                    df_year,
+                    rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
+                    col_to_use=0,
+                    n_rows=n_rows,
+                )
+
+        df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
+
+        skip_rows = 11
+        df_year = pd.concat(
+            [df_header, df_year[skip_rows:]], axis=0, join="outer"
+        ).reset_index(drop=True)
 
-# ###
-# configuration
-# ###
-
-input_folder = downloaded_data_path / "UNFCCC" / "Mongolia" / "BUR2"
-output_folder = extracted_data_path / "UNFCCC" / "Mongolia"
-
-if not output_folder.exists():
-    output_folder.mkdir()
-
-pdf_file = "20231112_NIR_MGL.pdf"
-output_filename = "MNG_BUR2_2023_"
-category_column = f"category ({coords_terminologies['category']})"
-compression = dict(zlib=True, complevel=9)
-
-
-def repl(m):  # noqa: D103
-    return m.group("code")
-
-
-# ###
-# 1. Read in main tables
-# ###
-
-df_main = None
-for year in inv_conf_per_year.keys():
-    print("-" * 60)
-    print(f"Reading year {year}.")
-    print("-" * 60)
-    df_year = None
-    for page in inv_conf_per_year[year]["page_defs"].keys():
-        print(f"Reading table from page {page}.")
-        tables_inventory_original = camelot.read_pdf(
-            str(input_folder / pdf_file),
-            pages=page,
-            table_areas=inv_conf_per_year[year]["page_defs"][page]["area"],
-            columns=inv_conf_per_year[year]["page_defs"][page]["cols"],
-            flavor="stream",
-            split_text=True,
+        df_year = pm2.pm2io.nir_add_unit_information(
+            df_year,
+            unit_row=inv_conf["unit_row"],
+            entity_row=inv_conf["entity_row"],
+            regexp_entity=".*",
+            regexp_unit=".*",
+            default_unit="Gg",
         )
-        print("Reading complete.")
 
-        df_page = tables_inventory_original[0].df
+        print("Added unit information.")
 
-        if df_year is None:
-            df_year = df_page
-        else:
-            df_year = pd.concat(
-                [df_year, df_page],
-                axis=0,
-                join="outer",
-            ).reset_index(drop=True)
+        # set index
+        df_year = df_year.set_index(inv_conf["index_cols"])
 
-    print(f"Concatenating all tables for {year}.")
-
-    # fix content that spreads across multiple rows
-    if "rows_to_fix" in inv_conf_per_year[year]:
-        for n_rows in inv_conf_per_year[year]["rows_to_fix"].keys():
-            print(f"Merge content for {n_rows=}")
-            df_year = fix_rows(
-                df_year,
-                rows_to_fix=inv_conf_per_year[year]["rows_to_fix"][n_rows],
-                col_to_use=0,
-                n_rows=n_rows,
-            )
+        # convert to long format
+        df_year_long = pm2.pm2io.nir_convert_df_to_long(
+            df_year, year, inv_conf["header_long"]
+        )
 
-    df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
+        # extract from tuple
+        df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
 
-    skip_rows = 11
-    df_year = pd.concat(
-        [df_header, df_year[skip_rows:]], axis=0, join="outer"
-    ).reset_index(drop=True)
+        # prep for conversion to PM2 IF and native format
+        # make a copy of the categories row
+        df_year_long["category"] = df_year_long["orig_cat_name"]
 
-    df_year = pm2.pm2io.nir_add_unit_information(
-        df_year,
-        unit_row=inv_conf["unit_row"],
-        entity_row=inv_conf["entity_row"],
-        regexp_entity=".*",
-        regexp_unit=".*",
-        default_unit="Gg",
-    )
+        # replace cat names by codes in col "category"
+        # first the manual replacements
 
-    print("Added unit information.")
+        df_year_long["category"] = df_year_long["category"].replace(
+            inv_conf["cat_codes_manual"]
+        )
 
-    # set index
-    df_year = df_year.set_index(inv_conf["index_cols"])
+        df_year_long["category"] = df_year_long["category"].str.replace(".", "")
 
-    # convert to long format
-    df_year_long = pm2.pm2io.nir_convert_df_to_long(
-        df_year, year, inv_conf["header_long"]
-    )
+        # then the regex replacements
+        df_year_long["category"] = df_year_long["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
 
-    # extract from tuple
-    df_year_long["orig_cat_name"] = df_year_long["orig_cat_name"].str[0]
+        df_year_long = df_year_long.reset_index(drop=True)
 
-    # prep for conversion to PM2 IF and native format
-    # make a copy of the categories row
-    df_year_long["category"] = df_year_long["orig_cat_name"]
+        df_year_long["data"] = df_year_long["data"].str.replace(",", "")
 
-    # replace cat names by codes in col "category"
-    # first the manual replacements
+        # make sure all col headers are str
+        df_year_long.columns = df_year_long.columns.map(str)
 
-    df_year_long["category"] = df_year_long["category"].replace(
-        inv_conf["cat_codes_manual"]
-    )
+        df_year_long = df_year_long.drop(columns=["orig_cat_name"])
 
-    df_year_long["category"] = df_year_long["category"].str.replace(".", "")
+        if df_main is None:
+            df_main = df_year_long
+        else:
+            df_main = pd.concat(
+                [df_main, df_year_long],
+                axis=0,
+                join="outer",
+            ).reset_index(drop=True)
 
-    # then the regex replacements
-    df_year_long["category"] = df_year_long["category"].str.replace(
-        inv_conf["cat_code_regexp"], repl, regex=True
+    ### convert to interchange format ###
+    print("Converting to interchange format.")
+    df_main_IF = pm2.pm2io.convert_long_dataframe_if(
+        df_main,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
     )
 
-    df_year_long = df_year_long.reset_index(drop=True)
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_IF)
 
-    df_year_long["data"] = df_year_long["data"].str.replace(",", "")
+    # ###
+    # 2. Read in trend tables
+    # ###
 
-    # make sure all col headers are str
-    df_year_long.columns = df_year_long.columns.map(str)
+    df_trend = None
+    for entity in inv_conf_per_entity.keys():
+        print("-" * 60)
+        print(f"Reading entity {entity}.")
 
-    df_year_long = df_year_long.drop(columns=["orig_cat_name"])
+        df_entity = None
 
-    if df_main is None:
-        df_main = df_year_long
-    else:
-        df_main = pd.concat(
-            [df_main, df_year_long],
-            axis=0,
-            join="outer",
-        ).reset_index(drop=True)
+        for page in inv_conf_per_entity[entity]["page_defs"].keys():
+            print(f"Reading page {page}.")
 
-### convert to interchange format ###
-print("Converting to interchange format.")
-df_main_IF = pm2.pm2io.convert_long_dataframe_if(
-    df_main,
-    coords_cols=coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies,
-    coords_value_mapping=coords_value_mapping,
-    filter_remove=filter_remove,
-    meta_data=meta_data,
-    convert_str=True,
-    time_format="%Y",
-)
-
-### convert to primap2 format ###
-print("Converting to primap2 format.")
-data_main_pm2 = pm2.pm2io.from_interchange_format(df_main_IF)
-
-# ###
-# 2. Read in trend tables
-# ###
-
-df_trend = None
-for entity in inv_conf_per_entity.keys():
-    print("-" * 60)
-    print(f"Reading entity {entity}.")
-
-    df_entity = None
+            tables_inventory_original = camelot.read_pdf(
+                str(input_folder / pdf_file),
+                pages=page,
+                table_areas=inv_conf_per_entity[entity]["page_defs"][page]["area"],
+                columns=inv_conf_per_entity[entity]["page_defs"][page]["cols"],
+                flavor="stream",
+                split_text=True,
+            )
+            df_page = tables_inventory_original[0].df
+
+            if df_entity is None:
+                df_entity = df_page
+            else:
+                df_entity = pd.concat(
+                    [df_entity, df_page],
+                    axis=0,
+                    join="outer",
+                ).reset_index(drop=True)
+            print(f"adding table from page {page}.")
+
+        if "rows_to_fix" in inv_conf_per_entity[entity]:
+            for n_rows in inv_conf_per_entity[entity]["rows_to_fix"].keys():
+                print(f"Merge content for {n_rows=}")
+                df_entity = fix_rows(
+                    df_entity,
+                    rows_to_fix=inv_conf_per_entity[entity]["rows_to_fix"][n_rows],
+                    col_to_use=0,
+                    n_rows=n_rows,
+                )
+
+        df_entity.columns = df_entity.iloc[0, :]
+        df_entity = df_entity[1:]
+
+        # unit is always Gg
+        df_entity.loc[:, "unit"] = inv_conf_per_entity[entity]["unit"]
+
+        # only one entity per table
+        df_entity.loc[:, "entity"] = entity
+
+        # TODO: Fix pandas "set value on slice of copy" warning
+        df_entity.loc[:, "category"] = df_entity.loc[
+            :, inv_conf_per_entity[entity]["category_column"]
+        ]
+
+        if "rows_to_drop" in inv_conf_per_entity[entity]:
+            for row in inv_conf_per_entity[entity]["rows_to_drop"]:
+                row_to_delete = df_entity.index[df_entity["category"] == row][0]
+                df_entity = df_entity.drop(index=row_to_delete)
+
+        df_entity.loc[:, "category"] = df_entity.loc[:, "category"].replace(
+            inv_conf_per_entity[entity]["cat_codes_manual"]
+        )
 
-    for page in inv_conf_per_entity[entity]["page_defs"].keys():
-        print(f"Reading page {page}.")
+        df_entity.loc[:, "category"] = df_entity["category"].str.replace(
+            inv_conf["cat_code_regexp"], repl, regex=True
+        )
 
-        tables_inventory_original = camelot.read_pdf(
-            str(input_folder / pdf_file),
-            pages=page,
-            table_areas=inv_conf_per_entity[entity]["page_defs"][page]["area"],
-            columns=inv_conf_per_entity[entity]["page_defs"][page]["cols"],
-            flavor="stream",
-            split_text=True,
+        df_entity = df_entity.drop(
+            columns=inv_conf_per_entity[entity]["columns_to_drop"]
         )
-        df_page = tables_inventory_original[0].df
 
-        if df_entity is None:
-            df_entity = df_page
+        for year in inv_conf_per_entity[entity]["years"]:
+            df_entity.loc[:, year] = df_entity[year].str.replace(",", "")
+
+        if df_trend is None:
+            df_trend = df_entity
         else:
-            df_entity = pd.concat(
-                [df_entity, df_page],
+            df_trend = pd.concat(
+                [df_trend, df_entity],
                 axis=0,
                 join="outer",
             ).reset_index(drop=True)
-        print(f"adding table from page {page}.")
-
-    if "rows_to_fix" in inv_conf_per_entity[entity]:
-        for n_rows in inv_conf_per_entity[entity]["rows_to_fix"].keys():
-            print(f"Merge content for {n_rows=}")
-            df_entity = fix_rows(
-                df_entity,
-                rows_to_fix=inv_conf_per_entity[entity]["rows_to_fix"][n_rows],
-                col_to_use=0,
-                n_rows=n_rows,
-            )
-
-    df_entity.columns = df_entity.iloc[0, :]
-    df_entity = df_entity[1:]
-
-    # unit is always Gg
-    df_entity.loc[:, "unit"] = inv_conf_per_entity[entity]["unit"]
-
-    # only one entity per table
-    df_entity.loc[:, "entity"] = entity
-
-    # TODO: Fix pandas "set value on slice of copy" warning
-    df_entity.loc[:, "category"] = df_entity.loc[
-        :, inv_conf_per_entity[entity]["category_column"]
-    ]
-
-    if "rows_to_drop" in inv_conf_per_entity[entity]:
-        for row in inv_conf_per_entity[entity]["rows_to_drop"]:
-            row_to_delete = df_entity.index[df_entity["category"] == row][0]
-            df_entity = df_entity.drop(index=row_to_delete)
-
-    df_entity.loc[:, "category"] = df_entity.loc[:, "category"].replace(
-        inv_conf_per_entity[entity]["cat_codes_manual"]
-    )
 
-    df_entity.loc[:, "category"] = df_entity["category"].str.replace(
-        inv_conf["cat_code_regexp"], repl, regex=True
+    ### convert to interchange format ###
+    df_trend_IF = pm2.pm2io.convert_wide_dataframe_if(
+        data_wide=df_trend,
+        coords_cols=coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        # filter_remove=filter_remove,
+        meta_data=meta_data,
+        convert_str=True,
+        time_format="%Y",
     )
 
-    df_entity = df_entity.drop(columns=inv_conf_per_entity[entity]["columns_to_drop"])
+    ### convert to primap2 format ###
+    print("Converting to primap2 format.")
+    data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_IF)
 
-    for year in inv_conf_per_entity[entity]["years"]:
-        df_entity.loc[:, year] = df_entity[year].str.replace(",", "")
+    # ###
+    # Merge main and trend tables.
+    # ###
 
-    if df_trend is None:
-        df_trend = df_entity
-    else:
-        df_trend = pd.concat(
-            [df_trend, df_entity],
-            axis=0,
-            join="outer",
-        ).reset_index(drop=True)
-
-### convert to interchange format ###
-df_trend_IF = pm2.pm2io.convert_wide_dataframe_if(
-    data_wide=df_trend,
-    coords_cols=coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies,
-    coords_value_mapping=coords_value_mapping,
-    # filter_remove=filter_remove,
-    meta_data=meta_data,
-    convert_str=True,
-    time_format="%Y",
-)
-
-### convert to primap2 format ###
-print("Converting to primap2 format.")
-data_trend_pm2 = pm2.pm2io.from_interchange_format(df_trend_IF)
+    print("Merging main and trend table.")
+    data_pm2 = data_main_pm2.pr.merge(data_trend_pm2, tolerance=1)
 
-# ###
-# Merge main and trend tables.
-# ###
+    # ###
+    # Save raw data to IF and native format.
+    # ###
 
-print("Merging main and trend table.")
-data_pm2 = data_main_pm2.pr.merge(data_trend_pm2, tolerance=1)
+    data_if = data_pm2.pr.to_interchange_format()
 
-# ###
-# Save raw data to IF and native format.
-# ###
-
-data_if = data_pm2.pr.to_interchange_format()
-
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
-    data_if,
-)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"] + "_raw"),
+        data_if,
+    )
 
-encoding = {var: compression for var in data_pm2.data_vars}
-data_pm2.pr.to_netcdf(
-    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
-    encoding=encoding,
-)
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies["category"] + "_raw.nc"),
+        encoding=encoding,
+    )
 
-# ###
-# Processing
-# ###
-
-data_proc_pm2 = process_data_for_country(
-    data_country=data_pm2,
-    entities_to_ignore=[],
-    gas_baskets=gas_baskets,
-    filter_dims=None,
-    cat_terminology_out=None,
-    category_conversion=None,
-    sectors_out=None,
-    processing_info_country=country_processing_step1,
-)
+    # ###
+    # Processing
+    # ###
+
+    data_proc_pm2 = process_data_for_country(
+        data_country=data_pm2,
+        entities_to_ignore=[],
+        gas_baskets=gas_baskets,
+        filter_dims=None,
+        cat_terminology_out=None,
+        category_conversion=None,
+        sectors_out=None,
+        processing_info_country=country_processing_step1,
+    )
 
-# ###
-# save processed data to IF and native format
-# ###
+    # ###
+    # save processed data to IF and native format
+    # ###
 
-terminology_proc = coords_terminologies["category"]
+    terminology_proc = coords_terminologies["category"]
 
-data_proc_if = data_proc_pm2.pr.to_interchange_format()
+    data_proc_if = data_proc_pm2.pr.to_interchange_format()
 
-if not output_folder.exists():
-    output_folder.mkdir()
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + terminology_proc), data_proc_if
-)
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + terminology_proc), data_proc_if
+    )
 
-encoding = {var: compression for var in data_proc_pm2.data_vars}
-data_proc_pm2.pr.to_netcdf(
-    output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
-)
+    encoding = {var: compression for var in data_proc_pm2.data_vars}
+    data_proc_pm2.pr.to_netcdf(
+        output_folder / (output_filename + terminology_proc + ".nc"), encoding=encoding
+    )
 
-print("Saved processed data.")
+    print("Saved processed data.")

+ 5 - 3
src/unfccc_ghg_data/unfccc_reader/Montenegro/read_MNE_BUR3_from_pdf.py

@@ -16,11 +16,12 @@ import re
 import camelot
 import pandas as pd
 import primap2 as pm2
-from config_mne_bur3 import aggregate_cats, cat_mapping, drop_data
 from primap2.pm2io._data_reading import matches_time_format
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
+from .config_mne_bur3 import aggregate_cats, cat_mapping, drop_data
+
 if __name__ == "__main__":
     # ###
     # configuration
@@ -232,7 +233,7 @@ if __name__ == "__main__":
     # rename the category col
     data_if_2006 = data_if_2006.rename(
         columns={
-            f"category ({coords_terminologies['category']})": "category (IPCC2006_PRIMAP)"
+            f"category ({coords_terminologies['category']})": "category (IPCC2006_PRIMAP)"  # noqa: E501
         }
     )
     data_if_2006.attrs["attrs"]["cat"] = "category (IPCC2006_PRIMAP)"
@@ -276,7 +277,8 @@ if __name__ == "__main__":
             ).sum(min_count=1)
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
-            # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
+            # df_combine.insert(1, "cat_name_translation",
+            # aggregate_cats[cat_to_agg]["name"])
             # df_combine.insert(2, "orig_cat_name", "computed")
 
             df_combine = df_combine.reset_index()
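The import hunk above shows the pattern used for the reader scripts: the country-specific configuration is imported relative to the package (`from .config_... import ...`) rather than as a bare top-level module, and the script logic sits under a main guard so that importing the module has no side effects. A minimal sketch with hypothetical module and variable names:

```python
# hypothetical module:
# src/unfccc_ghg_data/unfccc_reader/Examplia/read_EXA_BUR1_from_pdf.py
from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path

# relative import: config_exa_bur1.py sits next to this file in the package
# (the module and the names it exports are made up for this sketch)
from .config_exa_bur1 import coords_defaults

if __name__ == "__main__":
    # all reading and writing happens under the main guard, so importing the
    # module has no side effects
    input_folder = downloaded_data_path / "UNFCCC" / "Examplia" / "BUR1"
    output_folder = extracted_data_path / "UNFCCC" / "Examplia"
    print(f"Would read from {input_folder} using defaults {coords_defaults}")
    print(f"Would write to {output_folder}")
```

Note that a module using relative imports has to be started as a module, e.g. `python -m unfccc_ghg_data.unfccc_reader.Examplia.read_EXA_BUR1_from_pdf`, not as a plain file path; otherwise Python raises an ImportError about an attempted relative import with no known parent package.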

+ 10 - 7
src/unfccc_ghg_data/unfccc_reader/Morocco/read_MAR_BUR3_from_pdf.py

@@ -10,7 +10,11 @@ import copy
 import camelot
 import pandas as pd
 import primap2 as pm2
-from config_mar_bur3 import (
+from primap2.pm2io._data_reading import filter_data, matches_time_format
+
+from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
+
+from .config_mar_bur3 import (
     aggregate_cats,
     cat_mapping,
     header_defs,
@@ -18,9 +22,6 @@ from config_mar_bur3 import (
     table_defs,
     zero_cats,
 )
-from primap2.pm2io._data_reading import filter_data, matches_time_format
-
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
 if __name__ == "__main__":
     # ###
@@ -143,7 +144,8 @@ if __name__ == "__main__":
                 df_this_table.iloc[0, 1:] = ""
                 df_this_table.iloc[1 : last_shift_row + 1, 1:] = df_temp
 
-            # replace line breaks, long hyphens, double, and triple spaces in category names
+            # replace line breaks, long hyphens, double, and triple spaces in category
+            # names
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
             df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
@@ -263,7 +265,7 @@ if __name__ == "__main__":
     # rename the category col
     data_if_2006 = data_if_2006.rename(
         columns={
-            f"category ({coords_terminologies['category']})": "category (IPCC2006_PRIMAP)"
+            f"category ({coords_terminologies['category']})": "category (IPCC2006_PRIMAP)"  # noqa: E501
         }
     )
     data_if_2006.attrs["attrs"]["cat"] = "category (IPCC2006_PRIMAP)"
@@ -307,7 +309,8 @@ if __name__ == "__main__":
             ).sum(min_count=1)
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
-            # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
+            # df_combine.insert(1, "cat_name_translation",
+            # aggregate_cats[cat_to_agg]["name"])
             # df_combine.insert(2, "orig_cat_name", "computed")
 
             df_combine = df_combine.reset_index()
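A small self-contained demonstration of why `min_count=1` appears in the category aggregation above: without it, pandas turns a group that contains only missing values into `0.0`, silently fabricating a zero emission value, whereas with `min_count=1` the group stays `NaN`. The example data are made up.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "category": ["2.A", "2.A", "2.B", "2.B"],
        "2010": [1.0, 2.0, np.nan, np.nan],
    }
)

# default sum(): the all-NaN group "2.B" becomes 0.0
print(df.groupby("category")["2010"].sum())

# sum(min_count=1): the all-NaN group stays NaN, so missing data stays missing
print(df.groupby("category")["2010"].sum(min_count=1))
```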

+ 9 - 8
src/unfccc_ghg_data/unfccc_reader/Nigeria/read_NGA_BUR2_from_pdf.py

@@ -14,7 +14,15 @@ import numpy as np
 import pandas as pd
 import primap2 as pm2
 import xarray as xr
-from config_nga_bur2 import (
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    gas_baskets,
+    process_data_for_country,
+)
+
+from .config_nga_bur2 import (
     cat_code_regexp,
     cat_codes_manual,
     coords_cols,
@@ -35,13 +43,6 @@ from config_nga_bur2 import (
     year_inventory,
 )
 
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    gas_baskets,
-    process_data_for_country,
-)
-
 if __name__ == "__main__":
     # ###
     # configuration

+ 11 - 10
src/unfccc_ghg_data/unfccc_reader/Peru/read_PER_BUR3_from_pdf.py

@@ -11,7 +11,17 @@ import locale
 import camelot
 import pandas as pd
 import primap2 as pm2
-from config_per_bur3 import (
+from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    fix_rows,
+    gas_baskets,
+    process_data_for_country,
+)
+
+from .config_per_bur3 import (
     cat_code_regexp,
     cat_codes_manual,
     cat_conversion,
@@ -31,15 +41,6 @@ from config_per_bur3 import (
     table_defs,
     values_replacement,
 )
-from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
-
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    fix_rows,
-    gas_baskets,
-    process_data_for_country,
-)
 
 if __name__ == "__main__":
     ### general configuration

+ 5 - 4
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2021_Inventory_from_xlsx.py

@@ -12,7 +12,11 @@ import sys
 
 import pandas as pd
 import primap2 as pm2
-from config_kor_bur4 import (
+from primap2.pm2io._data_reading import filter_data, matches_time_format
+
+from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
+
+from .config_kor_bur4 import (
     aggregate_after_mapping,
     aggregate_before_mapping,
     cat_codes,
@@ -22,9 +26,6 @@ from config_kor_bur4 import (
     filter_remove_2006,
     filter_remove_after_agg,
 )
-from primap2.pm2io._data_reading import filter_data, matches_time_format
-
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
 if __name__ == "__main__":
     # ###

+ 5 - 4
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2022_Inventory_from_xlsx.py

@@ -11,7 +11,11 @@ import sys
 
 import pandas as pd
 import primap2 as pm2
-from config_kor_bur4 import (
+from primap2.pm2io._data_reading import filter_data, matches_time_format
+
+from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
+
+from .config_kor_bur4 import (
     aggregate_after_mapping,
     aggregate_before_mapping,
     cat_codes,
@@ -21,9 +25,6 @@ from config_kor_bur4 import (
     filter_remove_2006,
     filter_remove_after_agg,
 )
-from primap2.pm2io._data_reading import filter_data, matches_time_format
-
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
 if __name__ == "__main__":
     # ###

+ 0 - 384
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2023-Inventory_from_xlsx.py

@@ -1,384 +0,0 @@
-"""
-Read South Korea's 2023 Inventory from Excel file
-"""
-
-import os
-import sys
-
-import pandas as pd
-import primap2 as pm2
-from config_KOR_INV2023 import (
-    aggregate_after_mapping,
-    aggregate_before_mapping,
-    cat_codes,
-    cat_mapping,
-    cat_name_translations,
-    coords_terminologies_2006,
-    filter_remove_2006,
-    filter_remove_after_agg,
-    fix_rows,
-)
-from primap2.pm2io._data_reading import filter_data, matches_time_format
-
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
-
-# ###
-# configuration
-# ###
-input_folder = (
-    downloaded_data_path / "non-UNFCCC" / "Republic_of_Korea" / "2023-Inventory"
-)
-output_folder = extracted_data_path / "non-UNFCCC" / "Republic_of_Korea"
-if not output_folder.exists():
-    output_folder.mkdir()
-
-output_filename = "KOR_2023-Inventory_2023_"
-
-inventory_file = "Republic_of_Korea_National_GHG_Inventory_(1990_2021).xlsx"
-years_to_read = range(1990, 2020 + 1)
-
-sheets_to_read = ["온실가스", "CO2", "CH4", "N2O", "HFCs", "PFCs", "SF6"]
-cols_to_read = range(1, 2021 - 1990 + 3)
-
-# columns for category UNFCCC_GHG_data and original category name
-index_cols = ["분야·부문/연도"]
-
-sheet_metadata = {
-    "entity": {
-        "온실가스": "KYOTOGHG (SARGWP100)",
-        "CO2": "CO2",
-        "CH4": "CH4 (SARGWP100)",
-        "N2O": "N2O (SARGWP100)",
-        "HFCs": "HFCS (SARGWP100)",
-        "PFCs": "PFCS (SARGWP100)",
-        "SF6": "SF6 (SARGWP100)",
-    },
-    "unit": {
-        "온실가스": "Gg CO2 / yr",
-        "CO2": "Gg CO2 / yr",
-        "CH4": "Gg CO2 / yr",
-        "N2O": "Gg CO2 / yr",
-        "HFCs": "Gg CO2 / yr",
-        "PFCs": "Gg CO2 / yr",
-        "SF6": "Gg CO2 / yr",
-    },
-}
-
-# definitions for conversion to interchange format
-time_format = "%Y"
-
-coords_cols = {
-    "category": "category",
-    "entity": "entity",
-    "unit": "unit",
-}
-
-add_coords_cols = {
-    "orig_cat_name": ["orig_cat_name", "category"],
-    "cat_name_translation": ["cat_name_translation", "category"],
-}
-
-coords_terminologies = {
-    "area": "ISO3",
-    "category": "IPCC1996_KOR_INV",
-    "scenario": "PRIMAP",
-}
-
-coords_defaults = {
-    "source": "KOR-GHG-Inventory",
-    "provenance": "measured",
-    "area": "KOR",
-    "scenario": "INV2023",
-}
-
-coords_value_mapping = {
-    "cat_name_translation": cat_name_translations,
-    "category": cat_codes,
-}
-
-# filtering after IF creation to be able to use the IPCC codes
-filter_remove = {
-    "f1": {
-        "category (IPCC1996_KOR_INV)": "\\IGNORE",
-    },
-    # "livestock": { # temp until double cat name problem is solved
-    #     "category (IPCC1996_KOR_INV)": [
-    #         '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
-    #         '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
-    #     ]
-    # }
-}
-
-filter_keep = {}
-
-meta_data = {
-    "references": "http://www.gir.go.kr/home/board/read.do?pagerOffset=0&maxPageItems=10&maxIndexPages="
-    "10&searchKey=&searchValue=&menuId=36&boardId=62&boardMasterId=2&boardCategoryId=",
-    "rights": "",
-    "contact": "mail@johannes-guetschow.de",
-    "title": "Republic of Korea: National Greenhouse Gas Inventory Report 2023",
-    "comment": "Read fom xlsx file by Johannes Gütschow",
-    "institution": "Republic of Korea, Ministry of Environment, Greenhouse Gas Inventory and Research Center",
-}
-
-
-cols_for_space_stripping = []
-
-compression = dict(zlib=True, complevel=9)
-
-# ###
-# start data reading
-# ###
-
-# change working directory to script directory for proper folder names
-script_path = os.path.abspath(sys.argv[0])
-script_dir_name = os.path.dirname(script_path)
-os.chdir(script_dir_name)
-
-df_all = None
-
-for sheet in sheets_to_read:
-    print(f"Reading sheet {sheet}.")
-    # read current sheet (one sheet per gas)
-    df_current = pd.read_excel(
-        input_folder / inventory_file,
-        sheet_name=sheet,
-        skiprows=3,
-        nrows=146,
-        usecols=cols_to_read,
-        engine="openpyxl",
-    )
-    # drop all rows where the index cols (category UNFCCC_GHG_data and name) are both NaN
-    # as without one of them there is no category information
-    df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
-    # set index. necessary for the stack operation in the conversion to long format
-    # df_current = df_current.set_index(index_cols)
-    # make sure all col headers are str
-    df_current.columns = df_current.columns.map(str)
-
-    # fix the double category issue in livestock
-    lastrow = None
-    for i, row in df_current.iterrows():
-        if row["분야·부문/연도"] in fix_rows:
-            if lastrow == "A.  장내발효":
-                df_current.iloc[i]["분야·부문/연도"] = f'A.{df_current.iloc[i]["분야·부문/연도"]}'
-            elif lastrow == "B.  가축분뇨처리":
-                df_current.iloc[i]["분야·부문/연도"] = f'B.{df_current.iloc[i]["분야·부문/연도"]}'
-            else:
-                raise ValueError(  # noqa: TRY003
-                    f'Row to fix, but no fix defined {lastrow}, {row["분야·부문/연도"]}'
-                )
-        else:
-            lastrow = row["분야·부문/연도"]
-    # add columns
-    for col in sheet_metadata.keys():
-        df_current.insert(1, col, sheet_metadata[col][sheet])
-    # aggregate to one df
-    if df_all is None:
-        df_all = df_current
-    else:
-        df_all = pd.concat([df_all, df_current])
-
-df_all = df_all.reset_index(drop=True)
-# rename category col because filtering produces problems with korean col names
-df_all = df_all.rename(columns={"분야·부문/연도": "category"})
-
-# create copies of category col for further processing
-df_all["orig_cat_name"] = df_all["category"]
-df_all["cat_name_translation"] = df_all["category"]
-
-
-# ###
-# convert to PRIMAP2 interchange format
-# ###
-data_if = pm2.pm2io.convert_wide_dataframe_if(
-    df_all,
-    coords_cols=coords_cols,
-    add_coords_cols=add_coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies,
-    coords_value_mapping=coords_value_mapping,
-    # coords_value_filling=coords_value_filling,
-    # filter_remove=filter_remove,
-    # filter_keep=filter_keep,
-    meta_data=meta_data,
-    convert_str=True,
-    copy_df=True,  # we need the unchanged DF for the conversion step
-)
-
-filter_data(data_if, filter_remove=filter_remove)
-
-# conversion to PRIMAP2 native format
-data_pm2 = pm2.pm2io.from_interchange_format(data_if)
-# convert back to IF to have units in the fixed format
-data_pm2 = data_pm2.reset_coords(["orig_cat_name", "cat_name_translation"], drop=True)
-data_if = data_pm2.pr.to_interchange_format()
-
-# ###
-# save data to IF and native format
-# ###
-if not output_folder.exists():
-    output_folder.mkdir()
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + coords_terminologies["category"]), data_if
-)
-
-data_pm2 = pm2.pm2io.from_interchange_format(data_if)
-encoding = {var: compression for var in data_pm2.data_vars}
-data_pm2.pr.to_netcdf(
-    output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
-    encoding=encoding,
-)
-
-# ###
-# conversion to ipcc 2006 categories
-# ###
-
-
-data_if_2006 = pm2.pm2io.convert_wide_dataframe_if(
-    df_all,
-    coords_cols=coords_cols,
-    add_coords_cols=add_coords_cols,
-    coords_defaults=coords_defaults,
-    coords_terminologies=coords_terminologies_2006,
-    coords_value_mapping=coords_value_mapping,
-    meta_data=meta_data,
-    convert_str=True,
-    copy_df=True,  # don't mess up the dataframe when testing
-)
-
-cat_label = "category (" + coords_terminologies_2006["category"] + ")"
-# agg before mapping
-
-for cat_to_agg in aggregate_before_mapping:
-    mask = data_if_2006[cat_label].isin(aggregate_before_mapping[cat_to_agg]["sources"])
-    df_test = data_if_2006[mask]
-
-    if len(df_test) > 0:
-        print(f"Aggregating category {cat_to_agg}")
-        df_combine = df_test.copy(deep=True)
-
-        time_format = "%Y"
-        time_columns = [
-            col
-            for col in df_combine.columns.to_numpy()
-            if matches_time_format(col, time_format)
-        ]
-
-        for col in time_columns:
-            df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
-
-        df_combine = df_combine.groupby(
-            by=[
-                "source",
-                "scenario (PRIMAP)",
-                "provenance",
-                "area (ISO3)",
-                "entity",
-                "unit",
-            ]
-        ).sum()
-
-        df_combine = df_combine.drop(
-            columns=[
-                "category (IPCC2006_PRIMAP)",
-                "orig_cat_name",
-                "cat_name_translation",
-            ]
-        )
-        df_combine.insert(0, cat_label, cat_to_agg)
-        df_combine.insert(
-            1, "orig_cat_name", aggregate_before_mapping[cat_to_agg]["name"]
-        )
-
-        df_combine = df_combine.reset_index()
-
-        if cat_to_agg in aggregate_before_mapping[cat_to_agg]["sources"]:
-            filter_this_cat = {"f": {cat_label: cat_to_agg}}
-            filter_data(data_if_2006, filter_remove=filter_this_cat)
-
-        data_if_2006 = pd.concat([data_if_2006, df_combine])
-    else:
-        print(f"no data to aggregate category {cat_to_agg}")
-
-# filtering
-filter_data(data_if_2006, filter_remove=filter_remove_2006)
-
-# map 1 to 1 categories
-data_if_2006 = data_if_2006.replace({cat_label: cat_mapping})
-data_if_2006[cat_label].unique()
-
-# agg after mapping
-
-for cat_to_agg in aggregate_after_mapping:
-    mask = data_if_2006[cat_label].isin(aggregate_after_mapping[cat_to_agg]["sources"])
-    df_test = data_if_2006[mask]
-
-    if len(df_test) > 0:
-        print(f"Aggregating category {cat_to_agg}")
-        df_combine = df_test.copy(deep=True)
-
-        time_format = "%Y"
-        time_columns = [
-            col
-            for col in df_combine.columns.to_numpy()
-            if matches_time_format(col, time_format)
-        ]
-
-        for col in time_columns:
-            df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
-
-        df_combine = df_combine.groupby(
-            by=[
-                "source",
-                "scenario (PRIMAP)",
-                "provenance",
-                "area (ISO3)",
-                "entity",
-                "unit",
-            ]
-        ).sum()
-
-        df_combine = df_combine.drop(
-            columns=[
-                "category (IPCC2006_PRIMAP)",
-                "orig_cat_name",
-                "cat_name_translation",
-            ]
-        )
-        df_combine.insert(0, cat_label, cat_to_agg)
-        df_combine.insert(
-            1, "orig_cat_name", aggregate_after_mapping[cat_to_agg]["name"]
-        )
-
-        df_combine = df_combine.reset_index()
-
-        if cat_to_agg in aggregate_after_mapping[cat_to_agg]["sources"]:
-            filter_this_cat = {"f": {cat_label: cat_to_agg}}
-            filter_data(data_if_2006, filter_remove=filter_this_cat)
-
-        data_if_2006 = pd.concat([data_if_2006, df_combine])
-    else:
-        print(f"no data to aggregate category {cat_to_agg}")
-
-
-# conversion to PRIMAP2 native format
-data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
-# convert back to IF to have units in the fixed format
-data_pm2_2006 = data_pm2_2006.reset_coords(
-    ["orig_cat_name", "cat_name_translation"], drop=True
-)
-data_if_2006 = data_pm2_2006.pr.to_interchange_format()
-# save IPCC2006 data
-
-filter_data(data_if_2006, filter_remove=filter_remove_after_agg)
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + coords_terminologies_2006["category"]),
-    data_if_2006,
-)
-
-encoding = {var: compression for var in data_pm2_2006.data_vars}
-data_pm2_2006.pr.to_netcdf(
-    output_folder / (output_filename + coords_terminologies_2006["category"] + ".nc"),
-    encoding=encoding,
-)

+ 398 - 0
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_2023_Inventory_from_xlsx.py

@@ -0,0 +1,398 @@
+"""
+Read South Korea's 2023 Inventory from Excel file
+"""
+
+import os
+import sys
+
+import pandas as pd
+import primap2 as pm2
+from primap2.pm2io._data_reading import filter_data, matches_time_format
+
+from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
+
+from .config_KOR_INV2023 import (
+    aggregate_after_mapping,
+    aggregate_before_mapping,
+    cat_codes,
+    cat_mapping,
+    cat_name_translations,
+    coords_terminologies_2006,
+    filter_remove_2006,
+    filter_remove_after_agg,
+    fix_rows,
+)
+
+if __name__ == "__main__":
+    # ###
+    # configuration
+    # ###
+    input_folder = (
+        downloaded_data_path / "non-UNFCCC" / "Republic_of_Korea" / "2023-Inventory"
+    )
+    output_folder = extracted_data_path / "non-UNFCCC" / "Republic_of_Korea"
+    if not output_folder.exists():
+        output_folder.mkdir()
+
+    output_filename = "KOR_2023-Inventory_2023_"
+
+    inventory_file = "Republic_of_Korea_National_GHG_Inventory_(1990_2021).xlsx"
+    years_to_read = range(1990, 2020 + 1)
+
+    sheets_to_read = ["온실가스", "CO2", "CH4", "N2O", "HFCs", "PFCs", "SF6"]
+    cols_to_read = range(1, 2021 - 1990 + 3)
+
+    # columns for category UNFCCC_GHG_data and original category name
+    index_cols = ["분야·부문/연도"]
+
+    sheet_metadata = {
+        "entity": {
+            "온실가스": "KYOTOGHG (SARGWP100)",
+            "CO2": "CO2",
+            "CH4": "CH4 (SARGWP100)",
+            "N2O": "N2O (SARGWP100)",
+            "HFCs": "HFCS (SARGWP100)",
+            "PFCs": "PFCS (SARGWP100)",
+            "SF6": "SF6 (SARGWP100)",
+        },
+        "unit": {
+            "온실가스": "Gg CO2 / yr",
+            "CO2": "Gg CO2 / yr",
+            "CH4": "Gg CO2 / yr",
+            "N2O": "Gg CO2 / yr",
+            "HFCs": "Gg CO2 / yr",
+            "PFCs": "Gg CO2 / yr",
+            "SF6": "Gg CO2 / yr",
+        },
+    }
+
+    # definitions for conversion to interchange format
+    time_format = "%Y"
+
+    coords_cols = {
+        "category": "category",
+        "entity": "entity",
+        "unit": "unit",
+    }
+
+    add_coords_cols = {
+        "orig_cat_name": ["orig_cat_name", "category"],
+        "cat_name_translation": ["cat_name_translation", "category"],
+    }
+
+    coords_terminologies = {
+        "area": "ISO3",
+        "category": "IPCC1996_KOR_INV",
+        "scenario": "PRIMAP",
+    }
+
+    coords_defaults = {
+        "source": "KOR-GHG-Inventory",
+        "provenance": "measured",
+        "area": "KOR",
+        "scenario": "INV2023",
+    }
+
+    coords_value_mapping = {
+        "cat_name_translation": cat_name_translations,
+        "category": cat_codes,
+    }
+
+    # filtering after IF creation to be able to use the IPCC codes
+    filter_remove = {
+        "f1": {
+            "category (IPCC1996_KOR_INV)": "\\IGNORE",
+        },
+        # "livestock": { # temp until double cat name problem is solved
+        #     "category (IPCC1996_KOR_INV)": [
+        #         '4.B.1', '4.B.10', '4.B.2', '4.B.3', '4.B.4',
+        #         '4.B.5', '4.B.6', '4.B.7', '4.B.8', '4.B.9',
+        #     ]
+        # }
+    }
+
+    filter_keep = {}
+
+    meta_data = {
+        "references": "http://www.gir.go.kr/home/board/read.do?pagerOffset=0"
+        "&maxPageItems=10&maxIndexPages="
+        "10&searchKey=&searchValue=&menuId=36&boardId=62&boardMasterId=2"
+        "&boardCategoryId=",
+        "rights": "",
+        "contact": "mail@johannes-guetschow.de",
+        "title": "Republic of Korea: National Greenhouse Gas Inventory Report 2023",
+        "comment": "Read fom xlsx file by Johannes Gütschow",
+        "institution": "Republic of Korea, Ministry of Environment, Greenhouse "
+        "Gas Inventory and Research Center",
+    }
+
+    cols_for_space_stripping = []
+
+    compression = dict(zlib=True, complevel=9)
+
+    # ###
+    # start data reading
+    # ###
+
+    # change working directory to script directory for proper folder names
+    script_path = os.path.abspath(sys.argv[0])
+    script_dir_name = os.path.dirname(script_path)
+    os.chdir(script_dir_name)
+
+    df_all = None
+
+    for sheet in sheets_to_read:
+        print(f"Reading sheet {sheet}.")
+        # read current sheet (one sheet per gas)
+        df_current = pd.read_excel(
+            input_folder / inventory_file,
+            sheet_name=sheet,
+            skiprows=3,
+            nrows=146,
+            usecols=cols_to_read,
+            engine="openpyxl",
+        )
+        # drop all rows where the index cols (category UNFCCC_GHG_data and name)
+        # are both NaN
+        # as without one of them there is no category information
+        df_current = df_current.dropna(axis=0, how="all", subset=index_cols)
+        # set index. necessary for the stack operation in the conversion to long format
+        # df_current = df_current.set_index(index_cols)
+        # make sure all col headers are str
+        df_current.columns = df_current.columns.map(str)
+
+        # fix the double category issue in livestock
+        lastrow = None
+        for i, row in df_current.iterrows():
+            if row["분야·부문/연도"] in fix_rows:
+                if lastrow == "A.  장내발효":
+                    df_current.iloc[i][
+                        "분야·부문/연도"
+                    ] = f'A.{df_current.iloc[i]["분야·부문/연도"]}'
+                elif lastrow == "B.  가축분뇨처리":
+                    df_current.iloc[i][
+                        "분야·부문/연도"
+                    ] = f'B.{df_current.iloc[i]["분야·부문/연도"]}'
+                else:
+                    raise ValueError(  # noqa: TRY003
+                        f'Row to fix, but no fix defined {lastrow}, '
+                        f'{row["분야·부문/연도"]}'
+                    )
+            else:
+                lastrow = row["분야·부문/연도"]
+        # add columns
+        for col in sheet_metadata.keys():
+            df_current.insert(1, col, sheet_metadata[col][sheet])
+        # aggregate to one df
+        if df_all is None:
+            df_all = df_current
+        else:
+            df_all = pd.concat([df_all, df_current])
+
+    df_all = df_all.reset_index(drop=True)
+    # rename category col because filtering produces problems with korean col names
+    df_all = df_all.rename(columns={"분야·부문/연도": "category"})
+
+    # create copies of category col for further processing
+    df_all["orig_cat_name"] = df_all["category"]
+    df_all["cat_name_translation"] = df_all["category"]
+
+    # ###
+    # convert to PRIMAP2 interchange format
+    # ###
+    data_if = pm2.pm2io.convert_wide_dataframe_if(
+        df_all,
+        coords_cols=coords_cols,
+        add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies,
+        coords_value_mapping=coords_value_mapping,
+        # coords_value_filling=coords_value_filling,
+        # filter_remove=filter_remove,
+        # filter_keep=filter_keep,
+        meta_data=meta_data,
+        convert_str=True,
+        copy_df=True,  # we need the unchanged DF for the conversion step
+    )
+
+    filter_data(data_if, filter_remove=filter_remove)
+
+    # conversion to PRIMAP2 native format
+    data_pm2 = pm2.pm2io.from_interchange_format(data_if)
+    # convert back to IF to have units in the fixed format
+    data_pm2 = data_pm2.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
+    data_if = data_pm2.pr.to_interchange_format()
+
+    # ###
+    # save data to IF and native format
+    # ###
+    if not output_folder.exists():
+        output_folder.mkdir()
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
+
+    data_pm2 = pm2.pm2io.from_interchange_format(data_if)
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding=encoding,
+    )
+
+    # ###
+    # conversion to ipcc 2006 categories
+    # ###
+
+    data_if_2006 = pm2.pm2io.convert_wide_dataframe_if(
+        df_all,
+        coords_cols=coords_cols,
+        add_coords_cols=add_coords_cols,
+        coords_defaults=coords_defaults,
+        coords_terminologies=coords_terminologies_2006,
+        coords_value_mapping=coords_value_mapping,
+        meta_data=meta_data,
+        convert_str=True,
+        copy_df=True,  # don't mess up the dataframe when testing
+    )
+
+    cat_label = "category (" + coords_terminologies_2006["category"] + ")"
+    # agg before mapping
+
+    for cat_to_agg in aggregate_before_mapping:
+        mask = data_if_2006[cat_label].isin(
+            aggregate_before_mapping[cat_to_agg]["sources"]
+        )
+        df_test = data_if_2006[mask]
+
+        if len(df_test) > 0:
+            print(f"Aggregating category {cat_to_agg}")
+            df_combine = df_test.copy(deep=True)
+
+            time_format = "%Y"
+            time_columns = [
+                col
+                for col in df_combine.columns.to_numpy()
+                if matches_time_format(col, time_format)
+            ]
+
+            for col in time_columns:
+                df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
+
+            df_combine = df_combine.groupby(
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
+
+            df_combine = df_combine.drop(
+                columns=[
+                    "category (IPCC2006_PRIMAP)",
+                    "orig_cat_name",
+                    "cat_name_translation",
+                ]
+            )
+            df_combine.insert(0, cat_label, cat_to_agg)
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_before_mapping[cat_to_agg]["name"]
+            )
+
+            df_combine = df_combine.reset_index()
+
+            if cat_to_agg in aggregate_before_mapping[cat_to_agg]["sources"]:
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
+                filter_data(data_if_2006, filter_remove=filter_this_cat)
+
+            data_if_2006 = pd.concat([data_if_2006, df_combine])
+        else:
+            print(f"no data to aggregate category {cat_to_agg}")
+
+    # filtering
+    filter_data(data_if_2006, filter_remove=filter_remove_2006)
+
+    # map 1 to 1 categories
+    data_if_2006 = data_if_2006.replace({cat_label: cat_mapping})
+    data_if_2006[cat_label].unique()
+
+    # agg after mapping
+
+    for cat_to_agg in aggregate_after_mapping:
+        mask = data_if_2006[cat_label].isin(
+            aggregate_after_mapping[cat_to_agg]["sources"]
+        )
+        df_test = data_if_2006[mask]
+
+        if len(df_test) > 0:
+            print(f"Aggregating category {cat_to_agg}")
+            df_combine = df_test.copy(deep=True)
+
+            time_format = "%Y"
+            time_columns = [
+                col
+                for col in df_combine.columns.to_numpy()
+                if matches_time_format(col, time_format)
+            ]
+
+            for col in time_columns:
+                df_combine[col] = pd.to_numeric(df_combine[col], errors="coerce")
+
+            df_combine = df_combine.groupby(
+                by=[
+                    "source",
+                    "scenario (PRIMAP)",
+                    "provenance",
+                    "area (ISO3)",
+                    "entity",
+                    "unit",
+                ]
+            ).sum()
+
+            df_combine = df_combine.drop(
+                columns=[
+                    "category (IPCC2006_PRIMAP)",
+                    "orig_cat_name",
+                    "cat_name_translation",
+                ]
+            )
+            df_combine.insert(0, cat_label, cat_to_agg)
+            df_combine.insert(
+                1, "orig_cat_name", aggregate_after_mapping[cat_to_agg]["name"]
+            )
+
+            df_combine = df_combine.reset_index()
+
+            if cat_to_agg in aggregate_after_mapping[cat_to_agg]["sources"]:
+                filter_this_cat = {"f": {cat_label: cat_to_agg}}
+                filter_data(data_if_2006, filter_remove=filter_this_cat)
+
+            data_if_2006 = pd.concat([data_if_2006, df_combine])
+        else:
+            print(f"no data to aggregate category {cat_to_agg}")
+
+    # conversion to PRIMAP2 native format
+    data_pm2_2006 = pm2.pm2io.from_interchange_format(data_if_2006)
+    # convert back to IF to have units in the fixed format
+    data_pm2_2006 = data_pm2_2006.reset_coords(
+        ["orig_cat_name", "cat_name_translation"], drop=True
+    )
+    data_if_2006 = data_pm2_2006.pr.to_interchange_format()
+    # save IPCC2006 data
+
+    filter_data(data_if_2006, filter_remove=filter_remove_after_agg)
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies_2006["category"]),
+        data_if_2006,
+    )
+
+    encoding = {var: compression for var in data_pm2_2006.data_vars}
+    data_pm2_2006.pr.to_netcdf(
+        output_folder
+        / (output_filename + coords_terminologies_2006["category"] + ".nc"),
+        encoding=encoding,
+    )
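The aggregation loops above (before and after the category mapping) follow the same recipe: select the rows whose categories feed into the aggregate, coerce the year columns to numeric, group over the remaining metadata columns, sum, attach the new category code, and append the result. A compact, self-contained sketch of that recipe on made-up interchange-format-like data:

```python
import pandas as pd

# made-up miniature of an interchange-format table: a few metadata columns
# plus year columns (real tables carry more metadata and many more years)
data = pd.DataFrame(
    {
        "source": ["EXA-Inventory"] * 3,
        "entity": ["CO2"] * 3,
        "unit": ["Gg CO2 / yr"] * 3,
        "category": ["1.A", "1.B", "2"],
        "1990": ["10", "2", "5"],
        "1991": ["11", "3", "6"],
    }
)

# aggregate categories "1.A" and "1.B" into a new category "1"
sources = ["1.A", "1.B"]
subset = data[data["category"].isin(sources)].copy()

time_columns = ["1990", "1991"]
for col in time_columns:
    subset[col] = pd.to_numeric(subset[col], errors="coerce")

aggregated = subset.groupby(["source", "entity", "unit"])[time_columns].sum(
    min_count=1
)
aggregated.insert(0, "category", "1")
aggregated = aggregated.reset_index()

data = pd.concat([data, aggregated], ignore_index=True)
print(data)
```

The script above additionally drops the original category-name columns before grouping and removes a pre-existing aggregate row when the target code is itself among the sources; the sketch leaves those details out.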

+ 2 - 1
src/unfccc_ghg_data/unfccc_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py

@@ -11,11 +11,12 @@ import sys
 
 import pandas as pd
 import primap2 as pm2
-from config_kor_bur4 import cat_codes, cat_name_translations
 from primap2.pm2io._data_reading import filter_data
 
 from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
+from .config_kor_bur4 import cat_codes, cat_name_translations
+
 if __name__ == "__main__":
     # ###
     # configuration

+ 17 - 9
src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2022_Inventory_from_pdf.py

@@ -12,16 +12,20 @@ import copy
 import camelot
 import pandas as pd
 import primap2 as pm2
-from config_twn_nir2022 import (
+from primap2.pm2io._data_reading import matches_time_format
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    make_wide_table,
+)
+
+from .config_twn_nir2022 import (
     fix_rows,
     gwp_to_use,
-    make_wide_table,
     page_defs,
     table_defs,
 )
-from primap2.pm2io._data_reading import matches_time_format
-
-from unfccc_ghg_data.helper import downloaded_data_path, extracted_data_path
 
 if __name__ == "__main__":
     # ###
@@ -82,7 +86,8 @@ if __name__ == "__main__":
         "rights": "",
         "contact": "mail@johannes-guetschow.de",
         "title": "2022 Republic of China - National Greenhouse Gas Report",
-        "comment": "Read fom pdf file and converted to PRIMAP2 format by Johannes Gütschow",
+        "comment": "Read fom pdf file and converted to PRIMAP2 format by "
+        "Johannes Gütschow",
         "institution": "Republic of China - Environmental Protection Administration",
     }
 
@@ -186,7 +191,8 @@ if __name__ == "__main__":
         for col in table_def["rows_to_fix"].keys():
             for n_rows in table_def["rows_to_fix"][col].keys():
                 print(f"Fixing {col}, {n_rows}")
-                # replace line breaks, long hyphens, double, and triple spaces in category names
+                # replace line breaks, long hyphens, double, and triple spaces in
+                # category names
                 df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
                     "\n", " "
                 )
@@ -358,7 +364,8 @@ if __name__ == "__main__":
             ).sum(min_count=1)
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
-            # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
+            # df_combine.insert(1, "cat_name_translation",
+            # aggregate_cats[cat_to_agg]["name"])
             # df_combine.insert(2, "orig_cat_name", "computed")
 
             df_combine = df_combine.reset_index()
@@ -404,7 +411,8 @@ if __name__ == "__main__":
             ).sum(min_count=1)
 
             df_combine.insert(0, "category (IPCC2006_PRIMAP)", cat_to_agg)
-            # df_combine.insert(1, "cat_name_translation", aggregate_cats[cat_to_agg]["name"])
+            # df_combine.insert(1, "cat_name_translation",
+            # aggregate_cats[cat_to_agg]["name"])
             # df_combine.insert(2, "orig_cat_name", "computed")
 
             df_combine = df_combine.reset_index()

+ 0 - 258
src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2023-Inventory_from_pdf.py

@@ -1,258 +0,0 @@
-"""
-Read data from Taiwan's 2023 national inventory
-
-Data are read from the english summary pdf
-"""
-
-import copy
-
-import camelot
-import pandas as pd
-import primap2 as pm2
-from config_twn_nir2022 import fix_rows
-from config_twn_nir2023 import (
-    add_coords_cols,
-    basket_copy,
-    cat_code_regexp,
-    cat_conversion,
-    coords_cols,
-    coords_defaults,
-    coords_terminologies,
-    coords_value_mapping,
-    meta_data,
-    page_defs,
-    table_defs,
-    terminology_proc,
-)
-from primap2.pm2io._data_reading import matches_time_format
-
-from unfccc_ghg_data.helper import (
-    compression,
-    downloaded_data_path,
-    extracted_data_path,
-    gas_baskets,
-    make_wide_table,
-    process_data_for_country,
-)
-
-# ###
-# configuration
-# ###
-input_folder = downloaded_data_path / "non-UNFCCC" / "Taiwan" / "2023_NIR"
-output_folder = extracted_data_path / "non-UNFCCC" / "Taiwan"
-if not output_folder.exists():
-    output_folder.mkdir()
-
-output_filename = "TWN_inventory_2023_"
-inventory_file = "2023_NIR_executive_summary_english.pdf"
-
-
-def repl(m):  # noqa: D103
-    return m.group("UNFCCC_GHG_data")
-
-
-# ###
-# read the tables from pdf
-# ###
-
-all_tables = []
-for page in page_defs:
-    print(f"Reading from page {page}")
-    new_tables = camelot.read_pdf(
-        str(input_folder / inventory_file),
-        pages=page,
-        **page_defs[page],
-    )
-    for table in new_tables:
-        all_tables.append(table.df)
-
-
-# ###
-# convert tables to primap2 format
-# ###
-data_pm2 = None
-for table_name in table_defs.keys():
-    print(f"Working on table: {table_name}")
-
-    table_def = copy.deepcopy(table_defs[table_name])
-    # combine all raw tables
-    df_this_table = all_tables[table_def["tables"][0]].copy(deep=True)
-    if len(table_def["tables"]) > 1:
-        for table in table_def["tables"][1:]:
-            df_this_table = pd.concat(
-                [df_this_table, all_tables[table]], axis=0, join="outer"
-            )
-
-    # fix for table ES3.6
-    if table_name == "ES3.6":
-        col_idx = df_this_table[0] == "Total CO Emission"
-        df_this_table.loc[col_idx, 1:] = ""
-        df_this_table.loc[col_idx, 0] = "Total CO2 Emission"
-
-    df_this_table = df_this_table.reset_index(drop=True)
-
-    # fix categories if necessary
-    if "fix_cats" in table_def.keys():
-        for col in table_def["fix_cats"]:
-            df_this_table[col] = df_this_table[col].replace(table_def["fix_cats"][col])
-
-    # fix rows
-    for col in table_def["rows_to_fix"].keys():
-        for n_rows in table_def["rows_to_fix"][col].keys():
-            print(f"Fixing {col}, {n_rows}")
-            # replace line breaks, long hyphens, double, and triple spaces in category names
-            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("\n", " ")
-            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("   ", " ")
-            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("  ", " ")
-            df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace("-", "-")
-            df_this_table = fix_rows(
-                df_this_table, table_def["rows_to_fix"][col][n_rows], col, n_rows
-            )
-
-    # split by entity
-    if "gas_splitting" in table_def.keys():
-        col_entity = [""] * len(df_this_table)
-        last_entity = ""
-        for i in range(0, len(df_this_table)):
-            current_header = df_this_table[table_def["col_wide_kwd"]].iloc[i]
-            if current_header in table_def["gas_splitting"].keys():
-                last_entity = table_def["gas_splitting"][current_header]
-            col_entity[i] = last_entity
-
-        df_this_table["entity"] = col_entity
-        table_def["index_cols"].append("entity")
-
-    # make a wide table
-    df_this_table = make_wide_table(
-        df_this_table,
-        table_def["wide_keyword"],
-        table_def["col_wide_kwd"],
-        table_def["index_cols"],
-    )
-
-    if "drop_rows" in table_def.keys():
-        df_this_table = df_this_table.drop(table_def["drop_rows"], axis=0)
-
-    # reset row index
-    df_this_table = df_this_table.reset_index(drop=False)
-
-    # add entity
-    if "entity" in table_def.keys():
-        df_this_table["entity"] = table_def["entity"]
-
-    # add unit
-    df_this_table["unit"] = table_def["unit"]
-
-    df_this_table = df_this_table.rename(
-        {table_def["index_cols"][0]: "orig_cat_name"}, axis=1
-    )
-
-    # print(table_def["index_cols"][0])
-    # print(df_this_table.columns.values)
-
-    # make a copy of the categories row
-    df_this_table["category"] = df_this_table["orig_cat_name"]
-
-    # replace cat names by codes in col "category"
-    # first the manual replacements
-    df_this_table["category"] = df_this_table["category"].replace(
-        table_def["cat_codes_manual"]
-    )
-
-    # then the regex replacements
-    df_this_table["category"] = df_this_table["category"].str.replace(
-        cat_code_regexp, repl, regex=True
-    )
-
-    ### convert to PRIMAP2 IF
-    # remove ','
-    time_format = "%Y"
-    time_columns = [
-        col
-        for col in df_this_table.columns.to_numpy()
-        if matches_time_format(col, time_format)
-    ]
-
-    for col in time_columns:
-        df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(
-            ",", "", regex=False
-        )
-
-    # drop orig_cat_name as it's not unique per category
-    df_this_table = df_this_table.drop(columns="orig_cat_name")
-
-    # coords_defaults_this_table = coords_defaults.copy()
-    # coords_defaults_this_table["unit"] = table_def["unit"]
-    df_this_table_if = pm2.pm2io.convert_wide_dataframe_if(
-        df_this_table,
-        coords_cols=coords_cols,
-        add_coords_cols=add_coords_cols,
-        coords_defaults=coords_defaults,
-        coords_terminologies=coords_terminologies,
-        coords_value_mapping=coords_value_mapping,
-        # coords_value_filling=coords_value_filling,
-        # filter_remove=filter_remove,
-        # filter_keep=filter_keep,
-        meta_data=meta_data,
-    )
-
-    this_table_pm2 = pm2.pm2io.from_interchange_format(df_this_table_if)
-
-    if data_pm2 is None:
-        data_pm2 = this_table_pm2
-    else:
-        data_pm2 = data_pm2.pr.merge(this_table_pm2)
-
-# convert back to IF to have units in the fixed format
-data_if = data_pm2.pr.to_interchange_format()
-
-# ###
-# save data
-# ###
-# data in original categories
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + coords_terminologies["category"]), data_if
-)
-encoding = {var: compression for var in data_pm2.data_vars}
-data_pm2.pr.to_netcdf(
-    (output_folder / (output_filename + coords_terminologies["category"])).with_suffix(
-        ".nc"
-    ),
-    encoding=encoding,
-)
-
-
-# ###
-# convert to IPCC2006 categories
-# ###
-data_proc_pm2 = data_pm2.copy(deep=True)
-
-
-country_processing = {
-    "basket_copy": basket_copy,
-}
-
-data_proc_pm2 = process_data_for_country(
-    data_proc_pm2,
-    entities_to_ignore=[],
-    gas_baskets=gas_baskets,
-    processing_info_country=country_processing,
-    cat_terminology_out=terminology_proc,
-    category_conversion=cat_conversion,
-)
-
-# convert to IF
-data_proc_if = data_proc_pm2.pr.to_interchange_format()
-
-# ###
-# save data
-# ###
-# data in 2006 categories
-pm2.pm2io.write_interchange_format(
-    output_folder / (output_filename + "IPCC2006_PRIMAP"), data_proc_if
-)
-encoding = {var: compression for var in data_proc_pm2.data_vars}
-data_proc_pm2.pr.to_netcdf(
-    (output_folder / (output_filename + "IPCC2006_PRIMAP")).with_suffix(".nc"),
-    encoding=encoding,
-)

+ 266 - 0
src/unfccc_ghg_data/unfccc_reader/Taiwan/read_TWN_2023_Inventory_from_pdf.py

@@ -0,0 +1,266 @@
+"""
+Read data from Taiwan's 2023 national inventory
+
+Data are read from the english summary pdf
+"""
+
+import copy
+
+import camelot
+import pandas as pd
+import primap2 as pm2
+from primap2.pm2io._data_reading import matches_time_format
+
+from unfccc_ghg_data.helper import (
+    compression,
+    downloaded_data_path,
+    extracted_data_path,
+    gas_baskets,
+    make_wide_table,
+    process_data_for_country,
+)
+
+from .config_twn_nir2022 import fix_rows
+from .config_twn_nir2023 import (
+    add_coords_cols,
+    basket_copy,
+    cat_code_regexp,
+    cat_conversion,
+    coords_cols,
+    coords_defaults,
+    coords_terminologies,
+    coords_value_mapping,
+    meta_data,
+    page_defs,
+    table_defs,
+    terminology_proc,
+)
+
+if __name__ == "__main__":
+    # ###
+    # configuration
+    # ###
+    input_folder = downloaded_data_path / "non-UNFCCC" / "Taiwan" / "2023_NIR"
+    output_folder = extracted_data_path / "non-UNFCCC" / "Taiwan"
+    if not output_folder.exists():
+        output_folder.mkdir()
+
+    output_filename = "TWN_inventory_2023_"
+    inventory_file = "2023_NIR_executive_summary_english.pdf"
+
+    def repl(m):  # noqa: D103
+        return m.group("UNFCCC_GHG_data")
+
+    # ###
+    # read the tables from pdf
+    # ###
+
+    all_tables = []
+    for page in page_defs:
+        print(f"Reading from page {page}")
+        new_tables = camelot.read_pdf(
+            str(input_folder / inventory_file),
+            pages=page,
+            **page_defs[page],
+        )
+        for table in new_tables:
+            all_tables.append(table.df)
+
+    # ###
+    # convert tables to primap2 format
+    # ###
+    data_pm2 = None
+    for table_name in table_defs.keys():
+        print(f"Working on table: {table_name}")
+
+        table_def = copy.deepcopy(table_defs[table_name])
+        # combine all raw tables
+        df_this_table = all_tables[table_def["tables"][0]].copy(deep=True)
+        if len(table_def["tables"]) > 1:
+            for table in table_def["tables"][1:]:
+                df_this_table = pd.concat(
+                    [df_this_table, all_tables[table]], axis=0, join="outer"
+                )
+
+        # fix for table ES3.6
+        if table_name == "ES3.6":
+            col_idx = df_this_table[0] == "Total CO Emission"
+            df_this_table.loc[col_idx, 1:] = ""
+            df_this_table.loc[col_idx, 0] = "Total CO2 Emission"
+
+        df_this_table = df_this_table.reset_index(drop=True)
+
+        # fix categories if necessary
+        if "fix_cats" in table_def.keys():
+            for col in table_def["fix_cats"]:
+                df_this_table[col] = df_this_table[col].replace(
+                    table_def["fix_cats"][col]
+                )
+
+        # fix rows
+        for col in table_def["rows_to_fix"].keys():
+            for n_rows in table_def["rows_to_fix"][col].keys():
+                print(f"Fixing {col}, {n_rows}")
+                # replace line breaks, long hyphens, double, and triple spaces in
+                # category names
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "\n", " "
+                )
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "   ", " "
+                )
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "  ", " "
+                )
+                df_this_table.iloc[:, 0] = df_this_table.iloc[:, 0].str.replace(
+                    "-", "-"
+                )
+                df_this_table = fix_rows(
+                    df_this_table, table_def["rows_to_fix"][col][n_rows], col, n_rows
+                )
+
+        # split by entity
+        if "gas_splitting" in table_def.keys():
+            col_entity = [""] * len(df_this_table)
+            last_entity = ""
+            for i in range(0, len(df_this_table)):
+                current_header = df_this_table[table_def["col_wide_kwd"]].iloc[i]
+                if current_header in table_def["gas_splitting"].keys():
+                    last_entity = table_def["gas_splitting"][current_header]
+                col_entity[i] = last_entity
+
+            df_this_table["entity"] = col_entity
+            table_def["index_cols"].append("entity")
+
+        # make a wide table
+        df_this_table = make_wide_table(
+            df_this_table,
+            table_def["wide_keyword"],
+            table_def["col_wide_kwd"],
+            table_def["index_cols"],
+        )
+
+        if "drop_rows" in table_def.keys():
+            df_this_table = df_this_table.drop(table_def["drop_rows"], axis=0)
+
+        # reset row index
+        df_this_table = df_this_table.reset_index(drop=False)
+
+        # add entity
+        if "entity" in table_def.keys():
+            df_this_table["entity"] = table_def["entity"]
+
+        # add unit
+        df_this_table["unit"] = table_def["unit"]
+
+        df_this_table = df_this_table.rename(
+            {table_def["index_cols"][0]: "orig_cat_name"}, axis=1
+        )
+
+        # print(table_def["index_cols"][0])
+        # print(df_this_table.columns.values)
+
+        # make a copy of the categories row
+        df_this_table["category"] = df_this_table["orig_cat_name"]
+
+        # replace cat names by codes in col "category"
+        # first the manual replacements
+        df_this_table["category"] = df_this_table["category"].replace(
+            table_def["cat_codes_manual"]
+        )
+
+        # then the regex replacements
+        df_this_table["category"] = df_this_table["category"].str.replace(
+            cat_code_regexp, repl, regex=True
+        )
+
+        ### convert to PRIMAP2 IF
+        # remove ','
+        time_format = "%Y"
+        time_columns = [
+            col
+            for col in df_this_table.columns.to_numpy()
+            if matches_time_format(col, time_format)
+        ]
+
+        for col in time_columns:
+            df_this_table.loc[:, col] = df_this_table.loc[:, col].str.replace(
+                ",", "", regex=False
+            )
+
+        # drop orig_cat_name as it's not unique per category
+        df_this_table = df_this_table.drop(columns="orig_cat_name")
+
+        # coords_defaults_this_table = coords_defaults.copy()
+        # coords_defaults_this_table["unit"] = table_def["unit"]
+        df_this_table_if = pm2.pm2io.convert_wide_dataframe_if(
+            df_this_table,
+            coords_cols=coords_cols,
+            add_coords_cols=add_coords_cols,
+            coords_defaults=coords_defaults,
+            coords_terminologies=coords_terminologies,
+            coords_value_mapping=coords_value_mapping,
+            # coords_value_filling=coords_value_filling,
+            # filter_remove=filter_remove,
+            # filter_keep=filter_keep,
+            meta_data=meta_data,
+        )
+
+        this_table_pm2 = pm2.pm2io.from_interchange_format(df_this_table_if)
+
+        if data_pm2 is None:
+            data_pm2 = this_table_pm2
+        else:
+            data_pm2 = data_pm2.pr.merge(this_table_pm2)
+
+    # convert back to IF to have units in the fixed format
+    data_if = data_pm2.pr.to_interchange_format()
+
+    # ###
+    # save data
+    # ###
+    # data in original categories
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + coords_terminologies["category"]), data_if
+    )
+    encoding = {var: compression for var in data_pm2.data_vars}
+    data_pm2.pr.to_netcdf(
+        (
+            output_folder / (output_filename + coords_terminologies["category"])
+        ).with_suffix(".nc"),
+        encoding=encoding,
+    )
+
+    # ###
+    # convert to IPCC2006 categories
+    # ###
+    data_proc_pm2 = data_pm2.copy(deep=True)
+
+    country_processing = {
+        "basket_copy": basket_copy,
+    }
+
+    data_proc_pm2 = process_data_for_country(
+        data_proc_pm2,
+        entities_to_ignore=[],
+        gas_baskets=gas_baskets,
+        processing_info_country=country_processing,
+        cat_terminology_out=terminology_proc,
+        category_conversion=cat_conversion,
+    )
+
+    # convert to IF
+    data_proc_if = data_proc_pm2.pr.to_interchange_format()
+
+    # ###
+    # save data
+    # ###
+    # data in 2006 categories
+    pm2.pm2io.write_interchange_format(
+        output_folder / (output_filename + "IPCC2006_PRIMAP"), data_proc_if
+    )
+    encoding = {var: compression for var in data_proc_pm2.data_vars}
+    data_proc_pm2.pr.to_netcdf(
+        (output_folder / (output_filename + "IPCC2006_PRIMAP")).with_suffix(".nc"),
+        encoding=encoding,
+    )
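The table extraction at the top of this file follows camelot's standard pattern: a dictionary of per-page reading options, one `camelot.read_pdf` call per page, and collection of the raw cell grids as pandas DataFrames. A sketch with a hypothetical PDF and hypothetical page options (in the reader above both come from the downloaded data folder and the config_twn_nir2023 module):

```python
import camelot

# hypothetical input file and per-page reading options
inventory_file = "executive_summary_example.pdf"
page_defs = {
    "7": {"flavor": "lattice"},                # table with ruled lines
    "9": {"flavor": "stream", "row_tol": 10},  # whitespace-separated table
}

all_tables = []
for page, kwargs in page_defs.items():
    print(f"Reading from page {page}")
    tables = camelot.read_pdf(inventory_file, pages=page, **kwargs)
    # every camelot table exposes its raw cell content as a pandas DataFrame
    all_tables.extend(table.df for table in tables)

print(f"Collected {len(all_tables)} raw tables")
```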

+ 8 - 11
src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR3_from_pdf.py

@@ -8,7 +8,14 @@ Data are read from pdf using camelot
 import camelot
 import pandas as pd
 import primap2 as pm2
-from config_tha_bur3 import (
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)
+
+from .config_tha_bur3 import (
     cat_conversion,
     coords_cols,
     coords_cols_indirect,
@@ -30,12 +37,6 @@ from config_tha_bur3 import (
     trend_conf,
 )
 
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    process_data_for_country,
-)
-
 if __name__ == "__main__":
     # ###
     # configuration
@@ -152,8 +153,6 @@ if __name__ == "__main__":
     )
 
     df_main_sector_ts = tables_main_sector_ts[0].df.iloc[2:]
-    # df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
-    # df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
     df_main_sector_ts.columns = [trend_conf["header"], trend_conf["unit"]]
 
     df_main_sector_ts = df_main_sector_ts.transpose()
@@ -207,8 +206,6 @@ if __name__ == "__main__":
     )
 
     df_indirect = tables_indirect[0].df.iloc[2:]
-    # df_header = pd.DataFrame([header_main_sector_ts, unit_main_sector_ts])
-    # df_main_sector_ts = pd.concat([df_header, df_main_sector_ts], axis=0, join='outer')
     df_indirect.columns = [ind_conf["header"], ind_conf["unit"]]
 
     df_indirect = df_indirect.transpose()

+ 8 - 7
src/unfccc_ghg_data/unfccc_reader/Thailand/read_THA_BUR4_from_pdf.py

@@ -21,7 +21,14 @@ tables
 
 import pandas as pd
 import primap2 as pm2
-from config_tha_bur4 import (
+
+from unfccc_ghg_data.helper import (
+    downloaded_data_path,
+    extracted_data_path,
+    process_data_for_country,
+)
+
+from .config_tha_bur4 import (
     cat_codes_manual_main_sector_ts,
     cat_conversion,
     coords_cols,
@@ -42,12 +49,6 @@ from config_tha_bur4 import (
     terminology_proc,
 )
 
-from unfccc_ghg_data.helper import (
-    downloaded_data_path,
-    extracted_data_path,
-    process_data_for_country,
-)
-
 if __name__ == "__main__":
     # ###
     # configuration

+ 0 - 11
tests/unit/test_operations.py

@@ -1,11 +0,0 @@
-"""
-Test operations
-
-This module is just there to help with doc building etc. on
-project creation. You will probably delete it early in the project.
-"""
-from unfccc_ghg_data.operations import add_two
-
-
-def test_add_two():
-    assert add_two(3, 4) == 7