
Merge branch 'master' of gin.hemio.de:/jguetschow/UNFCCC_non-AnnexI_data

Johannes Gütschow 1 year ago
commit cf0d3138d3
100 changed files with 4520 additions and 665 deletions
  1. +4 -2  .gitignore
  2. +1 -0  DI_reading.dia
  3. +2 -12  UNFCCC_GHG_data/UNFCCC_CRF_reader/CRF_raw_for_year.py
  4. +10 -26  UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_core.py
  5. +1 -3  UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_devel.py
  6. +7 -16  UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py
  7. +4 -1  UNFCCC_GHG_data/UNFCCC_CRF_reader/__init__.py
  8. +0 -20  UNFCCC_GHG_data/UNFCCC_CRF_reader/util.py
  9. +474 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/UNFCCC_DI_reader_config.py
  10. +1407 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/UNFCCC_DI_reader_core.py
  11. +23 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/__init__.py
  12. +26 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/process_UNFCCC_DI_for_country.py
  13. +22 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/process_UNFCCC_DI_for_country_datalad.py
  14. +27 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country.py
  15. +17 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_datalad.py
  16. +19 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_group.py
  17. +19 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_group_datalad.py
  18. +13 -0  UNFCCC_GHG_data/UNFCCC_DI_reader/util.py
  19. +5 -0  UNFCCC_GHG_data/UNFCCC_downloader/__init__.py
  20. +13 -12  UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py
  21. +14 -11  UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py
  22. +9 -10  UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py
  23. +2 -3  UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py
  24. +2 -3  UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py
  25. +4 -4  UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py
  26. +1 -6  UNFCCC_GHG_data/UNFCCC_reader/Argentina/read_ARG_BUR4_from_pdf.py
  27. +1 -6  UNFCCC_GHG_data/UNFCCC_reader/Chile/read_CHL_BUR4_from_xlsx.py
  28. +1 -8  UNFCCC_GHG_data/UNFCCC_reader/Colombia/read_COL_BUR3_from_xlsx.py
  29. +1 -8  UNFCCC_GHG_data/UNFCCC_reader/Indonesia/read_IDN_BUR3_from_pdf.py
  30. +1 -8  UNFCCC_GHG_data/UNFCCC_reader/Mexico/read_MEX_BUR3_from_pdf.py
  31. +2 -8  UNFCCC_GHG_data/UNFCCC_reader/Morocco/read_MAR_BUR3_from_pdf.py
  32. +2 -9  UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_2021-Inventory_from_xlsx.py
  33. +1 -6  UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py
  34. +1 -8  UNFCCC_GHG_data/UNFCCC_reader/Taiwan/read_TWN_2022-Inventory_from_pdf.py
  35. +1 -8  UNFCCC_GHG_data/UNFCCC_reader/Thailand/read_THA_BUR3_from_pdf.py
  36. +1 -6  UNFCCC_GHG_data/UNFCCC_reader/__init__.py
  37. +3 -437  UNFCCC_GHG_data/UNFCCC_reader/get_submissions_info.py
  38. +3 -9  UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py
  39. +6 -1  UNFCCC_GHG_data/__init__.py
  40. +27 -0  UNFCCC_GHG_data/helper/__init__.py
  41. +2 -2  UNFCCC_GHG_data/helper/country_info.py
  42. +49 -0  UNFCCC_GHG_data/helper/definitions.py
  43. +1 -1  UNFCCC_GHG_data/helper/folder_mapping.py
  44. +510 -0  UNFCCC_GHG_data/helper/functions.py
  45. +1 -0  datasets/UNFCCC/DI_AnnexI/DI_AnnexI_2023-05-24_raw.csv
  46. +1 -0  datasets/UNFCCC/DI_AnnexI/DI_AnnexI_2023-05-24_raw.nc
  47. +1 -0  datasets/UNFCCC/DI_AnnexI/DI_AnnexI_2023-05-24_raw.yaml
  48. +1 -0  datasets/UNFCCC/DI_AnnexI/DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.csv
  49. +1 -0  datasets/UNFCCC/DI_AnnexI/DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.nc
  50. +29 -0  datasets/UNFCCC/DI_AnnexI/DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.yaml
  51. +1 -0  datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_2023-05-24_raw.csv
  52. +1 -0  datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_2023-05-24_raw.nc
  53. +1 -0  datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_2023-05-24_raw.yaml
  54. +1 -0  datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.csv
  55. +1 -0  datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.nc
  56. +40 -0  datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.yaml
  57. +1 -0  datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_c10c3850dad21f4182cfdac530b6b49e_raw_hash.csv
  58. +1 -0  datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_c10c3850dad21f4182cfdac530b6b49e_raw_hash.nc
  59. +29 -0  datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_c10c3850dad21f4182cfdac530b6b49e_raw_hash.yaml
  60. +6 -0  datasets/UNFCCC/no_updates_until.txt
  61. +94 -9  dodo.py
  62. +1 -0  downloaded_data/UNFCCC/00_new_downloads_BUR-2023-05-25.csv
  63. +1 -0  downloaded_data/UNFCCC/00_new_downloads_CRF2023-2023-05-25.csv
  64. +1 -0  downloaded_data/UNFCCC/00_new_downloads_NC-2023-05-26.csv
  65. +1 -0  downloaded_data/UNFCCC/Bosnia_and_Herzegovina/BUR3/TBUR_BiH_Oct__2022_ENG.pdf
  66. +1 -0  downloaded_data/UNFCCC/Bosnia_and_Herzegovina/NC4/FNC_BiH_ENG_fin.pdf
  67. +1 -0  downloaded_data/UNFCCC/Micronesia_(Federated_States_of)/BUR1/NC3_BUR1_MICRONESIA_UNFCCC.pdf
  68. +1 -0  downloaded_data/UNFCCC/Nicaragua/NC4/4CN-Nicaragua.pdf
  69. +1 -0  downloaded_data/UNFCCC/Niger/BUR1/92876103_Niger-BUR1-1-PREMIER_RAPPORT_BIENNAL_ACTUALISE_DU_NIGER.pdf
  70. +1 -0  downloaded_data/UNFCCC/Niger/BUR1/RIN_BUR-2022_VF_11-07-2022_FINAL.pdf
  71. +1413 -0  downloaded_data/UNFCCC/North_Macedonia/NC4/EN%2C_IV_NCCC.pdf
  72. +1 -0  downloaded_data/UNFCCC/North_Macedonia/NIR/IV_Inventory_report.pdf
  73. +1 -0  downloaded_data/UNFCCC/Somalia/BUR1/Somalia_First_BUR_report_2022.pdf
  74. +1 -0  downloaded_data/UNFCCC/Suriname/NC3/SURINAME_NC3_2023_FINAL.pdf
  75. +1 -1  downloaded_data/UNFCCC/submissions-bur.csv
  76. +1 -1  downloaded_data/UNFCCC/submissions-nc.csv
  77. +1 -0  extracted_data/UNFCCC/Afghanistan/AFG_DI_2023-05-24_raw.csv
  78. +1 -0  extracted_data/UNFCCC/Afghanistan/AFG_DI_2023-05-24_raw.nc
  79. +1 -0  extracted_data/UNFCCC/Afghanistan/AFG_DI_2023-05-24_raw.yaml
  80. +1 -0  extracted_data/UNFCCC/Afghanistan/AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.csv
  81. +1 -0  extracted_data/UNFCCC/Afghanistan/AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.nc
  82. +31 -0  extracted_data/UNFCCC/Afghanistan/AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.yaml
  83. +1 -0  extracted_data/UNFCCC/Albania/ALB_DI_2023-05-24_raw.csv
  84. +1 -0  extracted_data/UNFCCC/Albania/ALB_DI_2023-05-24_raw.nc
  85. +1 -0  extracted_data/UNFCCC/Albania/ALB_DI_2023-05-24_raw.yaml
  86. +1 -0  extracted_data/UNFCCC/Albania/ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.csv
  87. +1 -0  extracted_data/UNFCCC/Albania/ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.nc
  88. +31 -0  extracted_data/UNFCCC/Albania/ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.yaml
  89. +1 -0  extracted_data/UNFCCC/Algeria/DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.csv
  90. +1 -0  extracted_data/UNFCCC/Algeria/DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.nc
  91. +31 -0  extracted_data/UNFCCC/Algeria/DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.yaml
  92. +1 -0  extracted_data/UNFCCC/Algeria/DZA_DI_2023-05-24_raw.csv
  93. +1 -0  extracted_data/UNFCCC/Algeria/DZA_DI_2023-05-24_raw.nc
  94. +1 -0  extracted_data/UNFCCC/Algeria/DZA_DI_2023-05-24_raw.yaml
  95. +1 -0  extracted_data/UNFCCC/Angola/AGO_DI_2023-05-24_raw.csv
  96. +1 -0  extracted_data/UNFCCC/Angola/AGO_DI_2023-05-24_raw.nc
  97. +1 -0  extracted_data/UNFCCC/Angola/AGO_DI_2023-05-24_raw.yaml
  98. +1 -0  extracted_data/UNFCCC/Angola/AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.csv
  99. +1 -0  extracted_data/UNFCCC/Angola/AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.nc
  100. +31 -0  extracted_data/UNFCCC/Angola/AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.yaml

+ 4 - 2
.gitignore

@@ -6,6 +6,8 @@ __pycache__
 .doit.db
 log
 UNFCCC_GHG_data/datasets
-UNFCCC_GHG_data/UNFCCC_DI_reader
-
+UNFCCC_GHG_data/UNFCCC_DI_reader/test_UNFCCC_DI_reader.ipynb
+UNFCCC_GHG_data/UNFCCC_DI_reader/.ipynb_checkpoints/
+*.autosave
+#UNFCCC_GHG_data/UNFCCC_DI_reader
 

+ 1 - 0
DI_reading.dia

@@ -0,0 +1 @@
+.git/annex/objects/75/Pv/MD5E-s4431--8911139e2988aae3466a7b67ae6278a4.dia/MD5E-s4431--8911139e2988aae3466a7b67ae6278a4.dia

+ 2 - 12
UNFCCC_GHG_data/UNFCCC_CRF_reader/CRF_raw_for_year.py

@@ -9,20 +9,10 @@ submission are available in the downloaded data folder.
 # TODO: integrate into doit
 
 import argparse
-import sys
 import primap2 as pm2
 from pathlib import Path
 from datetime import date
-
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-#log_path = root_path / "log"
-code_path = root_path / "UNFCCC_GHG_data"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / "extracted_data" / "UNFCCC"
-dataset_path = root_path / "datasets" / "UNFCCC"
-
-#sys.path.append(code_path.name)
+from UNFCCC_GHG_data.helper import dataset_path_UNFCCC
 
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.util import all_crf_countries
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.UNFCCC_CRF_reader_prod import get_input_and_output_files_for_country
@@ -81,7 +71,7 @@ for country in all_crf_countries:
 today = date.today()
 
 compression = dict(zlib=True, complevel=9)
-output_folder = dataset_path / f"CRF{submission_year}"
+output_folder = dataset_path_UNFCCC / f"CRF{submission_year}"
 output_filename = f"CRF{submission_year}_raw_{today.strftime('%Y-%m-%d')}"
 
 if not output_folder.exists():
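
The deleted block above is the pattern this commit removes everywhere: each script rebuilt root_path and the data folders from Path(__file__). A minimal sketch of what the shared UNFCCC_GHG_data/helper definitions presumably provide, based on the removed lines (the names match the new imports; the parents depth is an assumption):

    from pathlib import Path

    # repository root; the parents index depends on where this file sits (assumed)
    root_path = Path(__file__).parents[1].absolute().resolve()
    code_path = root_path / "UNFCCC_GHG_data"
    log_path = root_path / "log"
    downloaded_data_path_UNFCCC = root_path / "downloaded_data" / "UNFCCC"
    extracted_data_path_UNFCCC = root_path / "extracted_data" / "UNFCCC"
    dataset_path_UNFCCC = root_path / "datasets" / "UNFCCC"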

+ 10 - 26
UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_core.py

@@ -8,9 +8,7 @@ import re
 import json
 import numpy as np
 import pandas as pd
-import xarray as xr
 import primap2 as pm2
-import pycountry
 from pathlib import Path
 from treelib import Tree
 from operator import itemgetter
@@ -18,8 +16,8 @@ from collections import Counter
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime, timedelta
 from . import crf_specifications as crf
-from .util import downloaded_data_path, NoCRFFilesError, custom_country_mapping
-
+from .util import NoCRFFilesError
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 
 ### reading functions
 def convert_crf_table_to_pm2if(
@@ -568,7 +566,7 @@ def get_crf_files(
     # we should only have files for one country and submission in the folder. But the
     # function can also be used on a given folder and then the filter is useful.
     if folder is None:
-        data_folder = downloaded_data_path
+        data_folder = downloaded_data_path_UNFCCC
         submission_folder = f"CRF{submission_year}"
 
         with open(data_folder / "folder_mapping.json", "r") as mapping_file:
@@ -935,7 +933,7 @@ def get_latest_date_for_country(
         str: string with date
     """
 
-    with open(downloaded_data_path / "folder_mapping.json", "r") as mapping_file:
+    with open(downloaded_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
         folder_mapping = json.load(mapping_file)
 
     if country_code in folder_mapping:
@@ -946,12 +944,12 @@ def get_latest_date_for_country(
         if isinstance(country_folders, str):
             # only one folder
             submission_date = find_latest_date(get_submission_dates(
-                downloaded_data_path / country_folders / f"CRF{submission_year}", file_filter))
+                downloaded_data_path_UNFCCC / country_folders / f"CRF{submission_year}", file_filter))
         else:
             dates = []
             for folder in country_folders:
                 dates = dates + get_submission_dates(
-                    downloaded_data_path / folder / f"CRF{submission_year}", file_filter)
+                    downloaded_data_path_UNFCCC / folder / f"CRF{submission_year}", file_filter)
             submission_date = find_latest_date(dates)
     else:
         raise ValueError(f"No data folder found for country {country_code}. "
@@ -1022,7 +1020,7 @@ def get_submission_parties(
                          f"the function's purpose is to return available parties.")
 
     if folder.exists():
-        files = filter_filenames(folder.glob("*.xlsx"), **file_filter)
+        files = filter_filenames(list(folder.glob("*.xlsx")), **file_filter)
     else:
         raise ValueError(f"Folder {folder} does not exist")
 
@@ -1034,6 +1032,7 @@ def get_submission_parties(
 
 def find_latest_date(
         dates: List[str],
+        date_format: str='%d%m%Y',
 )-> str:
     """
     Returns the latest date in a list of dates as str in the format
@@ -1050,26 +1049,11 @@ def find_latest_date(
     """
 
     if len(dates) > 0:
-        dates_datetime = [[date, datetime.strptime(date, "%d%m%Y")] for date in dates]
+        dates_datetime = [[date, datetime.strptime(date, date_format)] for date in
+                          dates]
         dates_datetime = sorted(dates_datetime, key=itemgetter(1))
     else:
         raise ValueError(f"Passed list of dates is empty")
 
     return dates_datetime[-1][0]
 
-
-def get_country_name(
-        country_code: str,
-) -> str:
-    """get country name from UNFCCC_GHG_data """
-    if country_code in custom_country_mapping:
-        country_name = custom_country_mapping[country_code]
-    else:
-        try:
-            country = pycountry.countries.get(alpha_3=country_code)
-            country_name = country.name
-        except:
-            raise ValueError(f"Country UNFCCC_GHG_data {country_code} can not be mapped to "
-                             f"any country")
-
-    return country_name
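
Two changes in this file are worth noting: find_latest_date gains an optional date_format argument (default '%d%m%Y' as before), and get_country_name moves to UNFCCC_GHG_data.helper. A short usage sketch of the new parameter (dates invented for illustration):

    from UNFCCC_GHG_data.UNFCCC_CRF_reader.UNFCCC_CRF_reader_core import find_latest_date

    # CRF submission dates keep the default '%d%m%Y' format
    find_latest_date(["30012023", "12042023"])                              # -> "12042023"
    # other callers, e.g. the DI reader, can pass their own format
    find_latest_date(["2022-08-22", "2023-05-24"], date_format="%Y-%m-%d")  # -> "2023-05-24"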

+ 1 - 3
UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_devel.py

@@ -13,11 +13,9 @@ from typing import List, Optional
 from pathlib import Path
 from datetime import date
 
-
 from .util import all_crf_countries
-from .util import log_path
+from UNFCCC_GHG_data.helper import log_path, get_country_name
 from . import crf_specifications as crf
-from .UNFCCC_CRF_reader_core import get_country_name
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country, read_crf_table
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 

+ 7 - 16
UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py

@@ -20,18 +20,16 @@ from .UNFCCC_CRF_reader_core import read_crf_table
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country
 from .UNFCCC_CRF_reader_core import get_crf_files
-from .UNFCCC_CRF_reader_core import get_country_name
 from .UNFCCC_CRF_reader_devel import save_unknown_categories_info
 from .UNFCCC_CRF_reader_devel import save_last_row_info
 
-from .util import code_path, log_path, \
-    custom_country_mapping, extracted_data_path, root_path, \
-    all_crf_countries, NoCRFFilesError
+from UNFCCC_GHG_data.helper import code_path, log_path, root_path
+from UNFCCC_GHG_data.helper import custom_country_mapping, extracted_data_path_UNFCCC
+from UNFCCC_GHG_data.helper import get_country_code, get_country_name
+from .util import all_crf_countries, NoCRFFilesError
 
 #import sys
 #sys.path.append(code_path.name)
-from ..UNFCCC_reader import get_country_code
-
 
 # functions:
 # * testing functions
@@ -42,8 +40,6 @@ from ..UNFCCC_reader import get_country_code
 
 # TODO: add function to read several / all countries
 
-
-
 # general approach:
 # main code in a function that reads one table from one file.
 # return raw pandas DF for use in different functions
@@ -188,7 +184,7 @@ def read_crf_for_country(
 
         if save_data:
             compression = dict(zlib=True, complevel=9)
-            output_folder = extracted_data_path / country_name.replace(" ", "_")
+            output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
             output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
 
             if not output_folder.exists():
@@ -415,11 +411,6 @@ def read_new_crf_for_year_datalad(
     )
 
 
-# function to read all available data (or list of countries?)
-# make sure it works when not all countries have submitted data
-# give option to only read new data (no output yet), but also option to
-# read all data, e.g. when specifications have changed
-
 def get_input_and_output_files_for_country(
         country: str,
         submission_year: int,
@@ -481,7 +472,7 @@ def get_input_and_output_files_for_country(
     country_info["input"] = input_files
 
     # get output file
-    output_folder = extracted_data_path / country_name.replace(" ", "_")
+    output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
     output_files = [output_folder / f"{country_code}_CRF{submission_year}"
                                     f"_{submission_date}.{suffix}" for suffix
                     in ['yaml', 'csv', 'nc']]
@@ -510,7 +501,7 @@ def submission_has_been_read(
     """
     Check if a CRF submission has already been read
     """
-    output_folder = extracted_data_path / country_name.replace(" ", "_")
+    output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
     output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
     if output_folder.exists():
         existing_files = output_folder.glob(f"{output_filename}.*")
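
submission_has_been_read builds the expected output location as shown above; the rest of the function is outside this hunk. A condensed sketch of the check, under the assumption that any existing yaml/csv/nc output counts as already read (the helper name outputs_exist is hypothetical):

    from pathlib import Path

    def outputs_exist(extracted_data_path_UNFCCC: Path, country_name: str,
                      country_code: str, submission_year: int,
                      submission_date: str) -> bool:
        # same path construction as in the diff above
        output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
        output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
        # assumption: one matching output file is enough to skip re-reading
        return output_folder.exists() and any(output_folder.glob(f"{output_filename}.*"))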

+ 4 - 1
UNFCCC_GHG_data/UNFCCC_CRF_reader/__init__.py

@@ -6,5 +6,8 @@ CRF reader module
 from . import crf_specifications
 from .UNFCCC_CRF_reader_prod import read_crf_for_country, read_crf_for_country_datalad
 
-__all__ = ["crf_specifications", "read_crf_for_country", "read_crf_for_country_datalad"]
+__all__ = ["crf_specifications",
+           "read_crf_for_country",
+           "read_crf_for_country_datalad",
+           ]
 
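
With these re-exports, callers import the production reader straight from the package. A hypothetical call (the full signature of read_crf_for_country is not shown in this commit; the argument names follow the prod module):

    from UNFCCC_GHG_data.UNFCCC_CRF_reader import read_crf_for_country

    # read one country's CRF 2023 submission (parameters assumed)
    read_crf_for_country("AUS", submission_year=2023)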

+ 0 - 20
UNFCCC_GHG_data/UNFCCC_CRF_reader/util.py

@@ -1,23 +1,3 @@
-from pathlib import Path
-
-# 4 for use from nbs, fix
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-log_path = root_path / "log"
-code_path = root_path / "UNFCCC_GHG_data"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / "extracted_data" / "UNFCCC"
-
-# TODO: move this to a more general location as we can't import it
-# to get_submissions_info
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom of Great Britain and Northern Ireland",
-}
 
 
 all_crf_countries = [
     'AUS', 'AUT', 'BEL', 'BGR', 'BLR',
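
The path constants and custom_country_mapping removed here move to UNFCCC_GHG_data.helper, which is what the deleted TODO asked for. A sketch of the country-name lookup the mapping supports, mirroring the get_country_name function this commit deletes from UNFCCC_CRF_reader_core.py (a None check replaces the original bare except):

    import pycountry

    # mapping as removed above; now provided by UNFCCC_GHG_data.helper
    custom_country_mapping = {
        "EUA": "European Union",
        "EUC": "European Union",
        "FRK": "France",
        "DKE": "Denmark",
        "DNM": "Denmark",
        "GBK": "United Kingdom of Great Britain and Northern Ireland",
    }

    def get_country_name(country_code: str) -> str:
        # custom (non-ISO) party codes first, then the ISO 3166 alpha-3 lookup
        if country_code in custom_country_mapping:
            return custom_country_mapping[country_code]
        country = pycountry.countries.get(alpha_3=country_code)
        if country is None:
            raise ValueError(f"Country code {country_code} can not be mapped to any country")
        return country.name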

+ 474 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/UNFCCC_DI_reader_config.py

@@ -0,0 +1,474 @@
+di_query_filters = [
+    'classifications', 'measures', 'gases',
+]
+# category, party are extra
+# measure is preprocessed to find ids
+
+# the activity data and emission factors have a structure that is incompatible
+# with PRIMAP2.
+# To read it into a primap2 dataframe, the information in classification / measure
+# has to be put into "entity", which is currently always "No gas". It's possible,
+# but takes some time, so I have omitted it here.
+filter_activity_factors = {
+    "entity": {"gas": ["No gas"]},
+    "unit": {"unit": [
+        'no unit', 'kg/TJ', 't/TJ', '%', 'kg/t',
+        'kg/kt', 't/t', 'kg/head/year', 'kg N2O/kg N handled', 'kg N2O/kg N',
+        'kg N2O-N/kg N handled', 'g/m^2', 'kg N2O-N/kg N', 'kg N2O-N/ha', 'kg/t dm',
+        't CO2-C/t', 't/unit', 't C/ha', 'kg CH4/ha', 'kg CO2/ha',
+        'g/kg', 'kg/kg DC',
+    ]
+    },
+}
+
+# regular expression to match category code in category label
+cat_code_regexp = r'(?P<code>^(([0-9][A-Za-z0-9\.]{0,10}[0-9A-Za-z]))|([0-9]))[' \
+                  r'\s\.].*'
+
+# PRIMAP2 interchange format config
+di_to_pm2if_template_nai = {
+    "coords_cols": {
+        "category": "category",
+        "entity": "gas",
+        "unit": "unit",
+        "area": "party",
+        "sec_cats__class": "classification",
+        "sec_cats__measure": "measure",
+        "data": "numberValue",
+        "time": "year",
+    },
+    # to store the original category name as well as the one mapped to IPCC categories
+    "add_coords_cols": {
+        "orig_cat_name": ["category_copy", "category"],
+    },
+    # terminologies for different coordinates
+    "coords_terminologies": {
+        "area": "ISO3",
+        "scenario": "Access_Date",
+        "category": "BURDI",
+    },
+    # default values for coordinates
+    "coords_defaults": {
+        "provenance": "measured",
+        "source": "UNFCCC",
+    },
+    # mapping of values e.g. gases to the primap2 format
+    "coords_value_mapping": {
+        "entity": {
+            "Aggregate GHGs (SARGWP100)": "KYOTOGHG (SARGWP100)",
+            "Aggregate F-gases (SARGWP100)": "FGASES (SARGWP100)",
+            "HFCs (SARGWP100)": "HFCS (SARGWP100)",
+            "PFCs (SARGWP100)": "PFCS (SARGWP100)",
+            #"SF6 (SARGWP100)": "SF6 (SARGWP100)",
+            #"CH4 (SARGWP100)": "CH4 (SARGWP100)",
+            "CO2 (SARGWP100)": "CO2",
+            #"N2O (SARGWP100)": "N2O (SARGWP100)",
+            #"Unspecified mix of HFCs and PFCs (SARGWP100)":
+            #    "UnspMixOfHFCsPFCs (SARGWP100)",
+            "Unspecified mix of HFCs (SARGWP100)": "UnspMixOfHFCs (SARGWP100)",
+            "Unspecified mix of PFCs (SARGWP100)": "UnspMixOfPFCs (SARGWP100)",
+            "HFC-23": "HFC23",
+            "HFC-32": "HFC32",
+            "HFC-41": "HFC41",
+            "HFC-43-10mee": "HFC4310mee",
+            "HFC-125": "HFC125",
+            "HFC-134": "HFC134",
+            "HFC-134a": "HFC134a",
+            "HFC-143": "HFC143",
+            "HFC-143a": "HFC143a",
+            "HFC-152": "HFC152",
+            "HFC-152a": "HFC152a",
+            "HFC-161": "HFC161",
+            "HFC-227ea": "HFC227ea",
+            "HFC-236ea": "HFC236ea",
+            "HFC-236cb": "HFC236cb",
+            "HFC-236fa": "HFC236fa",
+            "HFC-245ca": "HFC245ca",
+            "HFC-245fa": "HFC245fa",
+            "HFC-365mfc": "HFC365mfc",
+            "c-C4F8": "cC4F8",
+            "c-C3F6": "cC3F6",
+        },
+        "unit": "PRIMAP1",
+        "category": {
+            # NAI
+            "Total GHG emissions excluding LULUCF/LUCF": "15163",
+            "Total GHG emissions including LULUCF/LUCF": "24540",
+            "International Bunkers": "14637",
+            "Marine": "14423",
+            "Aviation": "14424",
+            "CO₂ Emissions from Biomass": "14638",
+        }
+    },
+    # fill missing data from other columns (not needed here)
+    "coords_value_filling": {
+    },
+    # remove data based on filters
+    "filter_remove": {
+    },
+    # keep only the data defined in the filters
+    "filter_keep": {
+    },
+    # define meta data
+    "meta_data": {
+        "references": "https://di.unfccc.int",
+        "title": "XXXX", # to set per country
+        "comment": "Data read from the UNFCCC DI flexible query interface using the API.",
+        "rights": "",
+        "contact": "mail@johannes-guetschow.de",
+        "institution": "United Nations Framework Convention on Climate Change (www.unfccc.int)",
+    },
+    # time format used in the input data
+    "time_format": "%Y",
+}
+
+di_to_pm2if_template_ai = {
+    "coords_cols": {
+        "category": "category",
+        "entity": "gas",
+        "unit": "unit",
+        "area": "party",
+        "sec_cats__class": "classification",
+        "sec_cats__measure": "measure",
+        "data": "numberValue",
+        "time": "year",
+    },
+    # to store the original category name as well as the one mapped to IPCC categories
+    "add_coords_cols": {
+        #"orig_cat_name": ["category_copy", "category"],
+    },
+    # terminologies for different coordinates
+    "coords_terminologies": {
+        "area": "ISO3",
+        "scenario": "Access_Date",
+        "category": "CRFDI",
+    },
+    # default values for coordinates
+    "coords_defaults": {
+        "provenance": "measured",
+        "source": "UNFCCC",
+    },
+    # mapping of values e.g. gases to the primap2 format
+    "coords_value_mapping": {
+        "entity": {
+            "Aggregate F-gases (AR4GWP100)": "FGASES (AR4GWP100)",
+            "Aggregate GHGs (AR4GWP100)": "KYOTOGHG (AR4GWP100)",
+            "HFCs (AR4GWP100)": "HFCS (AR4GWP100)",
+            "PFCs (AR4GWP100)": "PFCS (AR4GWP100)",
+            "Unspecified mix of HFCs and PFCs (AR4GWP100)":
+                "UnspMixOfHFCsPFCs (AR4GWP100)",
+            #"Unspecified mix of HFCs and PFCs":
+            #    "UnspMixOfHFCsPFCs", # this is problematic, mixes should use CO2eq
+            # with GWP
+            "Unspecified mix of HFCs (AR4GWP100)": "UnspMixOfHFCs (AR4GWP100)",
+            "Unspecified mix of PFCs (AR4GWP100)": "UnspMixOfPFCs (AR4GWP100)",
+            "HFC-23": "HFC23",
+            "HFC-32": "HFC32",
+            "HFC-41": "HFC41",
+            "HFC-43-10mee": "HFC4310mee",
+            "HFC-125": "HFC125",
+            "HFC-134": "HFC134",
+            "HFC-134a": "HFC134a",
+            "HFC-143": "HFC143",
+            "HFC-143a": "HFC143a",
+            "HFC-152": "HFC152",
+            "HFC-152a": "HFC152a",
+            "HFC-161": "HFC161",
+            "HFC-227ea": "HFC227ea",
+            "HFC-236ea": "HFC236ea",
+            "HFC-236cb": "HFC236cb",
+            "HFC-236fa": "HFC236fa",
+            "HFC-245ca": "HFC245ca",
+            "HFC-245fa": "HFC245fa",
+            "HFC-365mfc": "HFC365mfc",
+            "c-C4F8": "cC4F8",
+            "c-C3F6": "cC3F6",
+        },
+        "unit": "PRIMAP1",
+        "category": {
+            'Annual Change in Total Long-term C Storage': "11024",
+            'Annual Change in Total Long-term C Storage in HWP Waste': "11025",
+            'HWP in SWDS': "11036",
+            'International Aviation': "10357",
+            'International Navigation': "8828",
+            'Long-term Storage of C in Waste Disposal Sites': "temp",
+            'CO₂ Emissions from Biomass': "8270",
+            'International Bunkers': "8564",
+            'Multilateral Operations': "8987",
+            'Total Amount Captured for Storage': "11030",
+            'Total Amount of CO₂ Injected at Storage Sites': "11033",
+            'Total Amount of Exports for Storage': "11032",
+            'Total Amount of Imports for Storage': "11031",
+            'Total GHG emissions with LULUCF': "8677",
+            'Total GHG emissions with LULUCF including indirect CO₂': "10480",
+            'Total GHG emissions without LULUCF': "10464",
+            'Total GHG emissions without LULUCF including indirect CO₂': "10479",
+            'Total Leakage from Transport, Injection and Storage': "11034",
+            'Waste Incineration with Energy Recovery included as Biomass': "11027",
+            'Waste Incineration with Energy Recovery included as Fossil Fuels':
+                "11028",
+        }
+    },
+    # fill missing data from other columns (not needed here)
+    "coords_value_filling": {
+    },
+    # remove data based on filters
+    "filter_remove": {
+        # some unspecified mixes not reported in CO2eq have to be removed
+        "entity_wrong_unit": {
+            "gas": ["Unspecified mix of HFCs and PFCs"]
+        },
+        # remove data that is not for a gas (partly it currently can't be read and
+        # partly because the dataset is too large because of the many dimensions)
+        "entity_no_gas": {
+            "gas": ["No gas"]
+        },
+    },
+    # keep only the data defined in the filters
+    "filter_keep": {
+        "only_emission_measures": {
+            "measure": [
+                'Net carbon emissions',
+                'Net emissions/removals',
+                'Emissions from disposal',
+                'Emissions from manufacturing',
+                'Emissions from stocks',
+                'Indirect emissions',
+                'Direct emissions per MMS',
+                'Direct emissions per MMS - Anaerobic lagoon',
+                'Direct emissions per MMS - Composting',
+                'Direct emissions per MMS - Daily spread',
+                'Direct emissions per MMS - Digesters',
+                'Direct emissions per MMS - Liquid system',
+                'Direct emissions per MMS - Other',
+                'Direct emissions per MMS - Solid storage and dry lot',
+                'Indirect N2O emissions from atmospheric deposition',
+                'Indirect N2O emissions from nitrogen leaching and run-off',
+                'Net emissions/removals from HWP from domestic harvest',
+            ],
+        },
+    },
+    # define meta data
+    "meta_data": {
+        "references": "https://di.unfccc.int",
+        "title": "XXXX", # to set per country
+        "comment": "Data read from the UNFCCC DI flexible query interface using the API.",
+        "rights": "",
+        "contact": "mail@johannes-guetschow.de",
+        "institution": "United Nations Framework Convention on Climate Change (www.unfccc.int)",
+    },
+    # time format used in the input data
+    "time_format": "%Y",
+}
+
+cat_conversion = {
+    # ANNEXI to come (low priority as we read from CRF files)
+    'BURDI_to_IPCC2006_PRIMAP': {
+        'mapping': {
+            '1': '1',
+            '1.A': '1.A',
+            '1.A.1': '1.A.1',
+            '1.A.2': '1.A.2',
+            '1.A.3': '1.A.3',
+            '1.A.4': '1.A.4',
+            '1.A.5': '1.A.5',
+            '1.B': '1.B',
+            '1.B.1': '1.B.1',
+            '1.B.2': '1.B.2',
+            '2.A': '2.A',
+            '2.B': 'M.2.B_2.B',
+            '2.C': '2.C',
+            '2.D': 'M.2.H.1_2',
+            '2.E': 'M.2.B_2.E',
+            '2.F': '2.F',
+            '2.G': '2.H.3',
+            '4': 'M.AG',
+            '4.A': '3.A.1',
+            '4.B': '3.A.2',
+            '4.C': '3.C.7',
+            '4.D': 'M.3.C.45.AG',
+            '4.E': '3.C.1.c',
+            '4.F': '3.C.1.b',
+            '4.G': '3.C.8',
+            '5': 'M.LULUCF',
+            '6': '4',
+            '6.A': '4.A',
+            '6.B': '4.D',
+            '6.C': '4.C',
+            '6.D': '4.E',
+            '24540': '0',
+            '15163': 'M.0.EL',
+            '14637': 'M.BK',
+            '14424': 'M.BK.A',
+            '14423': 'M.BK.M',
+            '14638': 'M.BIO',
+            '7': '5',
+        }, #5.A-D ignored as not fitting 2006 cats
+        'aggregate': {
+            '2.B': {'sources': ['M.2.B_2.B', 'M.2.B_2.E'], 'name': 'Chemical Industry'},
+            '2.H': {'sources': ['M.2.H.1_2', '2.H.3'], 'name': 'Other'},
+            '2': {'sources': ['2.A', '2.B', '2.C', '2.F', '2.H'],
+                  'name': 'Industrial Processes and Product Use'},
+            '3.A': {'sources': ['3.A.1', '3.A.2'], 'name': 'Livestock'},
+            '3.C.1': {'sources': ['3.C.1.b', '3.C.1.c'],
+                         'name': 'Emissions from biomass burning'},
+            'M.3.C.1.AG': {'sources': ['3.C.1.b', '3.C.1.c'],
+                         'name': 'Emissions from biomass burning (Agriculture)'},
+            '3.C': {'sources': ['3.C.1', 'M.3.C.45.AG', '3.C.7', '3.C.8'],
+                         'name': 'Aggregate sources and non-CO2 emissions sources on land'},
+            'M.3.C.AG': {'sources': ['M.3.C.1.AG', 'M.3.C.45.AG', '3.C.7', '3.C.8'],
+                         'name': 'Aggregate sources and non-CO2 emissions sources on land ('
+                                 'Agriculture)'},
+            'M.AG.ELV': {'sources': ['M.3.C.AG'], 'name': 'Agriculture excluding livestock'},
+        },
+    },
+}
+
+di_processing_templates = {
+    # templates for the DI processing. Most processing rules will apply to several
+    # versions. So we store them here and refer to them in the processing info dict
+    'BFA': {
+        'DI2022-08-22': { # remove 2007, seems to have summed sectors (Agri and LULUCF)
+            # and missing sectors (e.g. 1,2 for CH4, N2O)
+            'remove_years': ['2007'],
+        },
+    },
+    'BIH': {
+        'DI2022-08-22': {
+            # downscaling in two steps
+            # 1990-2001 has different coverage than 2002-2012 and 2013-2014
+            # do not downscale KyotoGHG for 1990-2001 as that's aggregated
+            # later to avoid inconsistencies
+            'downscale': {
+                'sectors': {
+                    '1.A_1990': {
+                        'basket': '1.A',
+                        'basket_contents': ['1.A.1', '1.A.2', '1.A.3', '1.A.4',
+                                            '1.A.5'],
+                        'entities': ['CH4', 'CO2', 'N2O', 'CO', 'NMVOC', 'NOx', 'SO2'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '1.B_1990': {
+                        'basket': '1.B',
+                        'basket_contents': ['1.B.1', '1.B.2'],
+                        'entities': ['CH4', 'CO2', 'NMVOC', 'SO2'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '2_1990': {
+                        'basket': '2',
+                        'basket_contents': ['2.A', '2.B', '2.C', '2.D'],
+                        'entities': ['CH4', 'CO2', 'N2O', 'CO', 'NMVOC', 'NOx', 'SO2'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '4_1990': {
+                        'basket': '4',
+                        'basket_contents': ['4.A', '4.B', '4.C', '4.D', '4.E'],
+                        'entities': ['CH4', 'N2O'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '5_1990': {
+                        'basket': '5',
+                        'basket_contents': ['5.A'],
+                        'entities': ['CO2'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                    '6_1990': {
+                        'basket': '6',
+                        'basket_contents': ['6.A'],
+                        'entities': ['CH4'],
+                        'dim': 'category (BURDI)',
+                        'sel': {'time': ['1990', '1991', '1992', '1993', '1994',
+                                         '1995', '1996', '1997', '1998', '1999',
+                                         '2000', '2001']},
+                        'skipna_evaluation_dims': None,
+                        'skipna': True,
+                    },
+                },
+                'entities': { # 2002-2014
+                    'KYOTO': {
+                        'basket': 'KYOTOGHG (SARGWP100)',
+                        'basket_contents': ['CH4', 'CO2', 'N2O'],
+                        'sel': {'category (BURDI)':
+                                    ['1' ,'1.A' ,'1.A.1', '1.A.2', '1.A.3', '1.A.4',
+                                     '1.A.5', '1.B', '1.B.1', '1.B.2', '2', '2.A',
+                                     '2.B', '2.C', '2.D', '2.E', '4', '4.A', '4.B',
+                                     '4.C', '4.D', '4.E', '5', '5.A', '6', '6.A',
+                                     '6.B', '6.C', '14423', '14424', '14637',
+                                     '15163', '24540',
+                                     ],
+                                'time': ['2002', '2003', '2004', '2005', '2006',
+                                         '2007', '2008', '2009', '2010', '2011',
+                                         '2012', '2013', '2014'],
+                                },
+                    },
+                },
+            },
+        },
+    },
+}
+
+di_processing_info = {
+    # only countries with special processing listed
+    # category conversion is defined on a country group level
+    # the 'default' option is used if no specific option is found such that
+    # processing of new versions can be done before creating a configuration for the
+    # version.
+    'BFA': {
+        'default': di_processing_templates['BFA']['DI2022-08-22'],
+        'DI2022-08-22': di_processing_templates['BFA']['DI2022-08-22'],
+    },
+    'BIH': {
+        'default': di_processing_templates['BIH']['DI2022-08-22'],
+        'DI2022-08-22': di_processing_templates['BIH']['DI2022-08-22'],
+    },
+}
+
+gas_baskets = {
+    'HFCS (SARGWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
+                         'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
+                         'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
+                         'HFC407c', 'HFC410a', 'HFC4310mee', 'OTHERHFCS (SARGWP100)'],
+    'HFCS (AR4GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
+                         'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
+                         'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
+                         'HFC407c', 'HFC410a', 'HFC4310mee', 'Unspecified mix of HFCs (AR4GWP100)'],
+    'HFCS (AR5GWP100)': ['HFC23', 'HFC32', 'HFC41', 'HFC125', 'HFC134',
+                         'HFC134a', 'HFC143',  'HFC143a', 'HFC152a', 'HFC227ea',
+                         'HFC236fa', 'HFC245ca', 'HFC245fa', 'HFC365mfc',  'HFC404a',
+                         'HFC407c', 'HFC410a', 'HFC4310mee'],
+    'PFCS (SARGWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8'],
+    'PFCS (AR4GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8',  'Unspecified mix of PFCs (AR4GWP100)'],
+    'PFCS (AR5GWP100)': ['C3F8', 'C4F10', 'CF4', 'C2F6', 'C6F14', 'C5F12', 'cC4F8'],
+    'FGASES (SARGWP100)': ['HFCS (SARGWP100)', 'PFCS (SARGWP100)', 'SF6', 'NF3'],
+    'FGASES (AR4GWP100)': ['HFCS (AR4GWP100)', 'PFCS (AR4GWP100)', 'SF6', 'NF3'],
+    'FGASES (AR5GWP100)': ['HFCS (AR5GWP100)', 'PFCS (AR5GWP100)', 'SF6', 'NF3'],
+    'KYOTOGHG (SARGWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (SARGWP100)', 'PFCS (SARGWP100)'],
+    'KYOTOGHG (AR4GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR4GWP100)', 'PFCS (AR4GWP100)',
+                             'Unspecified mix of HFCs (AR4GWP100)', 'Unspecified mix of PFCs (AR4GWP100)'],
+    'KYOTOGHG (AR5GWP100)': ['CO2', 'CH4', 'N2O', 'SF6', 'NF3', 'HFCS (AR5GWP100)', 'PFCS (AR5GWP100)'],
+}
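
cat_code_regexp above extracts the category code from a DI category label: a code followed by whitespace or a dot, then the category name. A quick sanity check of the pattern (the labels are invented for illustration):

    import re

    cat_code_regexp = r'(?P<code>^(([0-9][A-Za-z0-9\.]{0,10}[0-9A-Za-z]))|([0-9]))[' \
                      r'\s\.].*'

    for label in ("1.A.2  Manufacturing Industries and Construction", "4 Agriculture"):
        match = re.match(cat_code_regexp, label)
        if match:
            print(match.group("code"))  # prints "1.A.2", then "4"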

+ 1407 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/UNFCCC_DI_reader_core.py

@@ -0,0 +1,1407 @@
+import primap2 as pm2
+import unfccc_di_api
+import pandas as pd
+import numpy as np
+import pycountry
+import itertools
+import json
+import copy
+import xarray as xr
+import datalad.api
+import re
+from datalad.support.exceptions import IncompleteResultsError
+from datetime import date
+from typing import Optional, Dict, List, Union
+from pathlib import Path
+from copy import deepcopy
+from dask.base import tokenize
+
+from UNFCCC_GHG_data.UNFCCC_CRF_reader.UNFCCC_CRF_reader_core import find_latest_date
+
+from .UNFCCC_DI_reader_config import di_to_pm2if_template_nai
+from .UNFCCC_DI_reader_config import di_to_pm2if_template_ai
+from .UNFCCC_DI_reader_config import di_query_filters
+from .UNFCCC_DI_reader_config import di_processing_info
+from .UNFCCC_DI_reader_config import cat_conversion
+from .UNFCCC_DI_reader_config import gas_baskets
+from .UNFCCC_DI_reader_config import cat_code_regexp
+from .util import NoDIDataError, nAI_countries, AI_countries
+from .util import DI_date_format, regex_date
+
+from UNFCCC_GHG_data.helper import custom_country_mapping
+from UNFCCC_GHG_data.helper import get_country_code, get_country_name
+from UNFCCC_GHG_data.helper import extracted_data_path_UNFCCC, root_path, code_path
+from UNFCCC_GHG_data.helper import dataset_path_UNFCCC
+from UNFCCC_GHG_data.helper import convert_categories
+
+
+def read_UNFCCC_DI_for_country(
+        country_code: str,
+        category_groups: Optional[Dict]=None,
+        read_subsectors: bool=False,
+        save_data: Optional[bool]=True,
+        date_str: Optional[str]=None,
+        pm2if_specifications: Optional[dict]=None,
+        default_gwp: Optional[str]=None,
+        debug: Optional[bool]=False,
+):
+    """
+    reads data for a country from the UNFCCC DI interface and saves to native and
+    interchange format
+    """
+
+    # read the data
+    data_df = read_UNFCCC_DI_for_country_df(
+        country_code=country_code,
+        category_groups=category_groups,
+        read_subsectors=read_subsectors,
+        debug=debug,
+    )
+
+    # set date_str if not given
+    if date_str is None:
+        today = date.today()
+        date_str = today.strftime(DI_date_format)
+
+    # convert raw data to pm2 interchange format and save
+    data_if = convert_DI_data_to_pm2_if(
+        data=data_df,
+        pm2if_specifications=deepcopy(pm2if_specifications),
+        default_gwp=default_gwp,
+        date_str=date_str,
+        debug=debug,
+    )
+
+    # convert raw data to native pm2 format and save that
+    data_pm2 = convert_DI_IF_data_to_pm2(
+        data_di_if=data_if,
+    )
+
+    # save
+    if save_data:
+        save_DI_country_data(data_pm2, raw=True)
+
+    return data_pm2
+
+
+def process_and_save_UNFCCC_DI_for_country(
+        country_code: str,
+        date_str: Union[str, None]=None,
+) -> xr.Dataset:
+    '''
+    process data and save them to disk using default parameters
+    '''
+
+    # get latest dataset if no date given
+    if date_str is None:
+        # get the latest date
+        raw_data_file = find_latest_DI_data(country_code, raw=True)
+    else:
+        raw_data_file = determine_filename(country_code, date_str, raw=True,
+                                           hash=False)
+
+        raw_data_file = raw_data_file.parent / (raw_data_file.name + '.nc')
+        print(f"process {raw_data_file.name}")
+        if not raw_data_file.exists():
+            raise ValueError(f"File {raw_data_file.name} does not exist. Check if it "
+                             "has been read.")
+
+    # load the data
+    data_to_process = pm2.open_dataset(raw_data_file)
+
+    # get parameters
+    countries = list(data_to_process.coords[data_to_process.attrs['area']].values)
+    if len(countries) > 1:
+        raise ValueError(
+            f"Found {len(countries)} countries. Only single country data "
+            f"can be processed by this function. countries: {countries}")
+    else:
+        country_code = countries[0]
+    processing_info_country = di_processing_info[country_code]
+    entities_to_ignore = [] # TODO: check and make default list
+
+    # process
+    data_processed = process_UNFCCC_DI_for_country(
+        data_country=data_to_process,
+        entities_to_ignore=entities_to_ignore,
+        gas_baskets=gas_baskets,
+        cat_conversion=cat_conversion,
+        sectors=None,
+        processing_info_country=processing_info_country,
+    )
+
+    # save
+    save_DI_country_data(data_processed, raw=False)
+
+    return data_processed
+
+
+def process_UNFCCC_DI_for_country(
+        data_country: xr.Dataset,
+        entities_to_ignore: List[str],
+        gas_baskets: Dict[str, List[str]],
+        cat_conversion: Dict[str, Dict] = None,
+        sectors: List[str] = None,
+        processing_info_country: Dict = None,
+) -> xr.Dataset:
+    """
+        Process data from DI interface (where necessary).
+        * Downscaling including subtraction of time series
+        * country specific sector aggregation
+        * Conversion to IPCC2006 categories
+        * general sector and gas basket aggregation (in new categories)
+    """
+    #### 0: gather information
+    countries = list(data_country.coords[data_country.attrs['area']].values)
+    if len(countries) > 1:
+        raise ValueError(
+            f"Found {len(countries)} countries. Only single country data "
+            f"can be processed by this function. countries: {countries}")
+    else:
+        country_code = countries[0]
+
+    cat_col = data_country.attrs['cat']
+    temp = re.findall(r'\((.*)\)', cat_col)
+    cat_terminology_in = temp[0]
+
+    #### 1: general processing
+    # remove unused cats
+    data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
+    # remove unused years
+    data_country = data_country.dropna(f'time', how='all')
+    # remove variables only containing nan
+    nan_vars_country = [var for var in data_country.data_vars if
+                        data_country[var].isnull().all().data == True]
+    data_country = data_country.drop_vars(nan_vars_country)
+
+    # remove unnecessary variables
+    entities_ignore_present = [entity for entity in entities_to_ignore if
+                               entity in data_country.data_vars]
+    data_country = data_country.drop_vars(entities_ignore_present)
+
+    #### 2: country specific processing
+
+
+    if processing_info_country is not None:
+        # get scenario
+        scenarios = list(data_country.coords[data_country.attrs['scen']].values)
+        if len(scenarios) > 1:
+            raise ValueError(
+                f"Found {len(scenarios)} scenarios. Only single scenario data "
+                f"can be processed by this function. Scenarios: {scenarios}")
+        else:
+            scenario = scenarios[0]
+            if scenario in processing_info_country.keys():
+                processing_info_country_scen = processing_info_country[scenario]
+            else:
+                processing_info_country_scen = processing_info_country['default']
+
+
+            if 'tolerance' in processing_info_country_scen:
+                tolerance = processing_info_country_scen["tolerance"]
+            else:
+                tolerance = 0.01
+
+            # take only desired years
+            if 'years' in processing_info_country_scen:
+                data_country = data_country.pr.loc[
+                    {'time': processing_info_country_scen['years']}]
+
+            # remove timeseries if desired
+            if 'remove_ts' in processing_info_country_scen:
+                for case in processing_info_country_scen['remove_ts']:
+                    remove_info = processing_info_country_scen['remove_ts'][case]
+                    entities = remove_info.pop("entities")
+                    for entity in entities:
+                        data_country[entity].pr.loc[remove_info] = \
+                            data_country[entity].pr.loc[remove_info] * np.nan
+
+            # remove all data for given years if necessary
+            if 'remove_years' in processing_info_country_scen:
+                data_country.pr.loc[{'time': processing_info_country_scen[
+                    'remove_years']}] = \
+                    data_country.pr.loc[{'time': processing_info_country_scen[
+                        'remove_years']}] * np.nan
+
+            # subtract categories
+            if 'subtract_cats' in processing_info_country_scen:
+                subtract_cats_current = processing_info_country_scen['subtract_cats']
+                if 'entities' in subtract_cats_current.keys():
+                    entities_current = subtract_cats_current['entities']
+                else:
+                    entities_current = list(data_country.data_vars)
+                print(f"Subtracting categories for country {country_code}, entities "
+                      f"{entities_current}")
+                for cat_to_generate in subtract_cats_current:
+                    cats_to_subtract = \
+                        subtract_cats_current[cat_to_generate]['subtract']
+                    data_sub = \
+                        data_country.pr.loc[{'category': cats_to_subtract}].pr.sum(
+                        dim='category', skipna=True, min_count=1)
+                    data_parent = data_country.pr.loc[
+                        {'category': subtract_cats_current[cat_to_generate]['parent']}]
+                    data_agg = data_parent - data_sub
+                    nan_vars = [var for var in data_agg.data_vars if
+                                data_agg[var].isnull().all().data == True]
+                    data_agg = data_agg.drop(nan_vars)
+                    if len(data_agg.data_vars) > 0:
+                        print(f"Generating {cat_to_generate} through subtraction")
+                        data_agg = data_agg.expand_dims([f'category ('
+                                                         f'{cat_terminology_in})'])
+                        data_agg = data_agg.assign_coords(
+                            coords={f'category ({cat_terminology_in})':
+                                        (f'category ({cat_terminology_in})',
+                                         [cat_to_generate])})
+                        data_country = data_country.pr.merge(data_agg,
+                                                             tolerance=tolerance)
+                    else:
+                        print(f"no data to generate category {cat_to_generate}")
+
+            # downscaling
+            if 'downscale' in processing_info_country_scen:
+                if 'sectors' in processing_info_country_scen['downscale']:
+                    sector_downscaling = \
+                        processing_info_country_scen['downscale']['sectors']
+                    for case in sector_downscaling.keys():
+                        print(f"Downscaling for {case}.")
+                        sector_downscaling_current = sector_downscaling[case]
+                        entities = sector_downscaling_current.pop('entities')
+                        for entity in entities:
+                            data_country[entity] = data_country[
+                                entity].pr.downscale_timeseries(
+                                **sector_downscaling_current)
+                            # , skipna_evaluation_dims=None)
+
+                if 'entities' in processing_info_country_scen['downscale']:
+                    entity_downscaling = \
+                        processing_info_country_scen['downscale']['entities']
+                    for case in entity_downscaling.keys():
+                        #print(case)
+                        print(data_country.coords[f'category ('
+                                                  f'{cat_terminology_in})'].values)
+                        data_country = data_country.pr.downscale_gas_timeseries(
+                            **entity_downscaling[case], skipna=True,
+                            skipna_evaluation_dims=None)
+
+            # aggregate categories
+            if 'aggregate_cats' in processing_info_country_scen:
+                aggregate_cats_current = processing_info_country_scen['aggregate_cats']
+                print(
+                    f"Aggregating categories for country {country_code}")
+                for cat_to_agg in aggregate_cats_current:
+                    print(f"Category: {cat_to_agg}")
+                    source_cats = aggregate_cats_current[cat_to_agg]['sources']
+                    data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
+                        dim='category', skipna=True, min_count=1)
+                    nan_vars = [var for var in data_agg.data_vars
+                                if data_agg[var].isnull().all().data]
+                    data_agg = data_agg.drop(nan_vars)
+                    if len(data_agg.data_vars) > 0:
+                        data_agg = data_agg.expand_dims([f'category ('
+                                                         f'{cat_terminology_in})'])
+                        data_agg = data_agg.assign_coords(
+                            coords={f'category ({cat_terminology_in})':
+                                        (f'category ({cat_terminology_in})',
+                                         [cat_to_agg])})
+                        data_country = data_country.pr.merge(data_agg,
+                                                             tolerance=tolerance)
+                    else:
+                        print(f"no data to aggregate category {cat_to_agg}")
+
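+            # Note: each 'aggregate_cats' entry used above is expected to have
+            # the shape (codes are placeholders, for illustration):
+            # cat_to_agg: {'sources': ['SOURCE_CAT_A', 'SOURCE_CAT_B']}
+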
+            # aggregate gases if desired
+            if 'aggregate_gases' in processing_info_country_scen:
+                for case in processing_info_country_scen['aggregate_gases'].keys():
+                    case_info = processing_info_country_scen['aggregate_gases'][case]
+                    data_country[case_info['basket']] = \
+                        data_country.pr.fill_na_gas_basket_from_contents(
+                            **case_info)
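+                # Illustrative 'aggregate_gases' case (keys are passed on to
+                # pr.fill_na_gas_basket_from_contents; names are assumptions):
+                # 'KYOTOGHG': {'basket': 'KYOTOGHG (SARGWP100)',
+                #              'basket_contents': ['CO2', 'CH4'],
+                #              'min_count': 1}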
+
+    #### 3: map categories
+    if country_code in nAI_countries:
+        # conversion from BURDI to IPCC2006_PRIMAP needed
+        cat_terminology_out = 'IPCC2006_PRIMAP'
+        data_country = convert_categories(
+            data_country,
+            cat_conversion[f"{cat_terminology_in}_to_{cat_terminology_out}"],
+            cat_terminology_out,
+            debug=False,
+            tolerance=0.01,
+        )
+    else:
+        cat_terminology_out = cat_terminology_in
+
+    # more general processing
+    # reduce categories to output cats
+    if sectors is not None:
+        cats_to_keep = [cat for cat in
+                        data_country.coords[f'category ({cat_terminology_out})'].values if
+                        cat in sectors]
+        data_country = data_country.pr.loc[{'category': cats_to_keep}]
+
+    # create gas baskets
+    entities_present = set(data_country.data_vars)
+    for basket in gas_baskets.keys():
+        basket_contents_present = [gas for gas in gas_baskets[basket] if
+                                   gas in entities_present]
+        if len(basket_contents_present) > 0:
+            if basket in list(data_country.data_vars):
+                data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
+                    basket=basket, basket_contents=basket_contents_present, min_count=1)
+            else:
+                try:
+                    data_country[basket] = xr.full_like(data_country["CO2"],
+                                                        np.nan).pr.quantify(
+                        units="Gg CO2 / year")
+                    data_country[basket].attrs = {"entity": basket.split(' ')[0],
+                                                  "gwp_context": basket.split(' ')[1][
+                                                                 1:-1]}
+                    data_country[basket] = data_country.pr.gas_basket_contents_sum(
+                        basket=basket, basket_contents=basket_contents_present,
+                        min_count=1)
+                except Exception:
+                    print(f"No gas basket created for {country_code}")
+
+    # amend title and comment
+    data_country.attrs["comment"] = data_country.attrs["comment"] + \
+        f" Processed on {date.today()}"
+    data_country.attrs["title"] = data_country.attrs["title"] + \
+        f" Processed on {date.today()}"
+
+    return data_country
+
+
+def read_UNFCCC_DI_for_country_df(
+        country_code: str,
+        category_groups: Optional[Dict] = None,
+        read_subsectors: bool = False,
+        debug: bool = False,
+) -> pd.DataFrame:
+    """
+    Read UNFCCC DI data for a given country. All data is read, including all
+    categories, gases, measures, and classifications.
+    Filtering is done later, on conversion to PRIMAP2 format.
+
+    Parameters
+    ----------
+    country_code: str
+        ISO3 code of the country (country names don't work, use the wrapper function)
+
+    category_groups: dict (optional)
+        defines which categories to read, including filters on classification,
+        measure, and gases, e.g.
+
+        category_groups = {
+            "4.A  Enteric Fermentation": { #4.A  Enteric Fermentation[14577]
+                "measure": [
+                    'Net emissions/removals',
+                    'Total population',
+                ],
+                "gases": ["CH4"],
+            },
+        }
+
+    read_subsectors: bool (optional)
+        If True, also read all subsectors of the matching categories. Can be
+        overridden per category via a "read_subsectors" key in the config.
+
+    debug: bool (optional)
+        If True, print additional information on the queries and the data read
+
+    Returns
+    -------
+    pd.DataFrame with the data read
+
+    """
+    reader = unfccc_di_api.UNFCCCApiReader()
+
+    # template for the query to the DI API
+    query_template = {
+        "party_codes": [country_code],
+        "normalize_gas_names": True
+    }
+
+    # find country group
+    if country_code in nAI_countries:
+        ai_country = False
+    elif country_code in AI_countries:
+        ai_country = True
+    else:
+        raise ValueError(f"Country code {country_code} found neither in AnnexI nor "
+                         f"non-AnnexI country lists.")
+
+    if category_groups is None:
+        # no category defs given, so use default which is all categories,
+        # all gases, but no other data
+        if debug:
+            print(f"Using default config to read for country {country_code}")
+        if ai_country:
+            all_gases = reader.annex_one_reader.gases["name"]
+            query = query_template.copy()
+            query["gases"] = list(set(all_gases) - {"No gas"})
+            if debug:
+                print(f"Using query: {query}")
+            di_data = reader.annex_one_reader.query(**query)
+        else:
+            all_gases = reader.non_annex_one_reader.gases["name"]
+            query = query_template.copy()
+            query["gases"] = list(set(all_gases) - {"No gas"})
+            if debug:
+                print(f"Using query: {query}")
+            di_data = reader.non_annex_one_reader.query(**query)
+    else:
+        # detailed query per category (could also be just the top level cat)
+
+        # read available categories and measures
+        if ai_country:
+            categories = reader.annex_one_reader.category_tree.all_nodes()
+            measures = reader.annex_one_reader.measure_tree.all_nodes()
+        else:
+            categories = reader.non_annex_one_reader.category_tree.all_nodes()
+            measures = reader.non_annex_one_reader.measure_tree.all_nodes()
+
+        # set data to None so the variable exists for the first category
+        di_data = None
+
+        for category in category_groups:
+            if debug:
+                print(f"Working on {category}")
+            this_cat_config = category_groups[category]
+            # category specific query
+            query = query_template.copy()
+            for query_filter in di_query_filters:
+                if query_filter in this_cat_config.keys():
+                    query[query_filter] = this_cat_config[query_filter]
+
+            # get the category nodes with the given tag (might be multiple)
+            cat_nodes = [cat_node for cat_node in categories if cat_node.tag == category]
+            if debug:
+                print(f"Found fitting category nodes: {cat_nodes}")
+            node_codes = []
+            for node in cat_nodes:
+                if "read_subsectors" in this_cat_config.keys():
+                    read_subsectors_this_cat = this_cat_config["read_subsectors"]
+                else:
+                    read_subsectors_this_cat = read_subsectors
+                if read_subsectors_this_cat:
+                    # get the subcategories from the category tree of the
+                    # matching country group
+                    if ai_country:
+                        cat_tree = reader.annex_one_reader.category_tree
+                    else:
+                        cat_tree = reader.non_annex_one_reader.category_tree
+                    sub_nodes = cat_tree.subtree(
+                        nid=node.identifier).all_nodes()
+                    node_codes = node_codes + (
+                        [sub_node.identifier for sub_node in sub_nodes])
+                else:
+                    node_codes = node_codes + [node.identifier]
+            if debug:
+                print(f"Found node_codes: {node_codes}")
+            # add category node_codes to query
+            query["category_ids"] = node_codes
+
+            if "measure" in this_cat_config:
+                measure_nodes = [
+                    measure_node for measure_node in measures if
+                    measure_node.tag in this_cat_config["measure"]]
+                if debug:
+                    print(f"Found measure_nodes: {measure_nodes}")
+                # add measure nodes to query
+                query["measure_ids"] = [node.identifier for node in measure_nodes]
+            if debug:
+                print(query)
+
+            # read the data; if no data is available for the query, the
+            # NoDataError is caught and a message is printed
+            try:
+                if ai_country:
+                    data_new = reader.annex_one_reader.query(**query)
+                else:
+                    data_new = reader.non_annex_one_reader.query(**query)
+
+                n_points = len(data_new)
+                n_countries = len(data_new["party"].unique())
+                if debug:
+                    print(f"Collected {n_points} data points for {n_countries} countries")
+                if di_data is None:
+                    di_data = data_new
+                else:
+                    di_data = pd.concat([di_data, data_new])
+            except unfccc_di_api.NoDataError:
+                print(f"No data for {category}")
+
+    # check that data has been collected and optionally print some information
+    if di_data is None:
+        raise ValueError(f"No data collected for country {country_code} and "
+                         f"category groups {category_groups}")
+    elif debug:
+        # print some information on collected data
+        print(f"Collected data for country {country_code}")
+        print("### Categories ###")
+        categories = di_data["category"].unique()
+        categories.sort()
+        print(categories)
+        print("### Classifications ###")
+        classifications = di_data["classification"].unique()
+        classifications.sort()
+        print(classifications)
+        print("### Measures ###")
+        measures = di_data["measure"].unique()
+        measures.sort()
+        print(measures)
+
+    return di_data
+
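+# Usage sketch (illustrative only; country code and category group are
+# hypothetical examples, the config keys follow the docstring above):
+#
+# df = read_UNFCCC_DI_for_country_df(
+#     country_code="KEN",
+#     category_groups={"4.A  Enteric Fermentation": {"gases": ["CH4"]}},
+#     debug=True,
+# )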
+
+def convert_DI_data_to_pm2_if(
+        data: pd.DataFrame,
+        pm2if_specifications: Optional[dict] = None,
+        default_gwp: Optional[str] = None,
+        date_str: Optional[str] = None,
+        debug: bool = False,
+) -> pd.DataFrame:
+    """
+    Convert data returned from the unfccc_di_api package to primap2 interchange format
+
+    TODO: consider moving the specification template into this function and using the
+    config parameter only to overwrite certain parameters (makes sense if the function
+    is used in a broader context)
+    """
+
+    print("Convert data to PRIMAP2 interchange format")
+
+    # create a copy of the data to avoid altering the original data.
+    # This will be done inside the *convert_to_long_dataframe* function
+    # in the future; then it can be removed here, once the category column
+    # copy workaround is no longer necessary
+    data_temp = data.copy(deep=True)
+
+    # check which country group we have (the country lists come from the
+    # module level, so no API reader instance is needed here)
+    parties_present_ai = [party for party in data_temp["party"].unique()
+                          if party in AI_countries]
+    parties_present_nai = [party for party in data_temp["party"].unique()
+                          if party in nAI_countries]
+    if len(parties_present_ai) > 0:
+        if len(parties_present_nai) > 0:
+            raise ValueError("AnnexI and non-AnnexI parties present in one dataset. "
+                             "This is not possible due to different DI category "
+                             "terminologies. Convert to common categories.")
+        else:
+            ai_dataset = True
+    else:
+        ai_dataset = False
+
+    if pm2if_specifications is None:
+        if ai_dataset:
+            pm2if_specifications = deepcopy(di_to_pm2if_template_ai)
+        else:
+            pm2if_specifications = deepcopy(di_to_pm2if_template_nai)
+
+    # modify specifications
+    #pm2if_specifications["filter_remove"].update(filter_activity_factors)
+
+    # set the scenario to today's date if not given explicitly
+    if date_str == "country":
+        pm2if_specifications["coords_defaults"]["scenario"] = "DIrolling"
+    else:
+        if date_str is None:
+            date_str = date.today().strftime(DI_date_format)
+        pm2if_specifications["coords_defaults"]["scenario"] = f"DI{date_str}"
+
+    # set metadata
+    countries = data["party"].unique()
+    if len(countries) > 1:
+        pm2if_specifications["meta_data"]["title"] = \
+            f"Data submitted to the UNFCCC by countries {countries} as " \
+            f"available in the DI interface on {date_str}."
+    else:
+        try:
+            country_info = pycountry.countries.get(alpha_3=countries[0])
+            country_name = country_info.name
+        except Exception:
+            country_name = countries[0]
+
+        pm2if_specifications["meta_data"]["title"] = \
+            f"Data submitted to the UNFCCC by country {country_name} as " \
+            f"available in the DI interface on {date_str}."
+
+    pm2if_specifications["meta_data"]["comment"] = \
+        pm2if_specifications["meta_data"]["comment"] + f" Data read on {date_str}."
+
+    # remove baseyear
+    idx_base_year = data_temp["year"] == "Base year"
+    data_temp = data_temp.drop(data_temp.index[idx_base_year])
+
+    # add GWP to entities where necessary
+    data_temp["unit"] = data_temp["unit"].replace(to_replace=r"(.*) CO2 equivalent",
+                                                  value=r"\1CO2eq", regex=True)
+    row_idx_co2eq = data_temp["unit"].str.endswith("CO2eq")
+    if default_gwp is not None:
+        # convert all with the GWP given in the input
+        data_temp.loc[row_idx_co2eq, "gas"] = data_temp.loc[row_idx_co2eq, "gas"] + \
+                                              f" ({default_gwp})"
+    elif ai_dataset:
+        # convert with AR4
+        data_temp.loc[row_idx_co2eq, "gas"] = data_temp.loc[row_idx_co2eq, "gas"] + \
+                                              " (AR4GWP100)"
+    else:
+        # convert with SAR
+        data_temp.loc[row_idx_co2eq, "gas"] = data_temp.loc[row_idx_co2eq, "gas"] + \
+                                              " (SARGWP100)"
+
+    # combine numeric and string values
+    nan_idx = data_temp["numberValue"].isna()
+    data_temp.loc[nan_idx, "numberValue"] = data_temp.loc[nan_idx, "stringValue"]
+    data_temp = data_temp.drop(columns=["stringValue"])
+
+    # Currently in primap2, a column can only be used once when reading a
+    # dataframe. We want to use the category column both for the primap2
+    # "category" column (which contains the code only) and for an additional
+    # column which stores the full name as available from the DI API. As a
+    # workaround we create a copy of the category column
+    if not ai_dataset:
+        data_temp["category_copy"] = data_temp["category"]
+
+    # replace category name and code by just the code
+    repl = lambda m: m.group('code')
+    data_temp["category"] = data_temp["category"].str.replace(cat_code_regexp, repl,
+                                                              regex=True)
+
+    # convert to pm2 interchange format
+    data_pm2if = pm2.pm2io.convert_long_dataframe_if(
+        data_temp,
+        **pm2if_specifications,
+    )
+
+    return data_pm2if
+
+
+def convert_DI_IF_data_to_pm2(
+        data_di_if: pd.DataFrame,
+) -> xr.Dataset:
+    if_index_cols = set(itertools.chain(*data_di_if.attrs["dimensions"].values()))
+    time_cols = set(data_di_if.columns.values) - if_index_cols
+    data_di_if.dropna(subset=time_cols, inplace=True, how="all")
+
+    try:
+        # use a copy as from_interchange_format modifies the input DF
+        data_pm2 = pm2.pm2io.from_interchange_format(
+            data_di_if.copy(deep=True), attrs=copy.deepcopy(data_di_if.attrs))
+    except Exception as ex:  # TODO: use a more specific error once primap2 provides one
+        print(f'Error on conversion to PRIMAP2 native format: {ex}')
+        raise
+
+    return data_pm2
+
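+# Illustrative end-to-end conversion using the functions defined above
+# (country code is a hypothetical example):
+#
+# df = read_UNFCCC_DI_for_country_df("KEN")
+# data_if = convert_DI_data_to_pm2_if(df)
+# data_pm2 = convert_DI_IF_data_to_pm2(data_if)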
+
+def save_DI_country_data(
+        data_pm2: xr.Dataset,
+        raw: bool = True,
+):
+    '''
+    Save primap2 and IF data to the country folder.
+    Can be used for raw and processed data, but only for a single country.
+    '''
+
+    # preparations
+    data_if = data_pm2.pr.to_interchange_format()
+
+    ## get country
+    countries = data_if[data_pm2.attrs['area']].unique()
+    if len(countries) > 1:
+        raise ValueError(f"More than one country in input data. This function can "
+                         f"only handle single country data. Countries: {countries}")
+    else:
+        country_code = countries[0]
+
+    ## get timestamp
+    scenario_col = data_pm2.attrs['scen']
+    scenarios = data_if[scenario_col].unique()
+    if len(scenarios) > 1:
+        raise ValueError(f"More than one scenario in input data. This function can "
+                         f"only handle single scenario data. Scenarios: {scenarios}")
+    else:
+        scenario = scenarios[0]
+
+    date_str = scenario[2:]
+
+    # calculate the hash of the data to see if it's identical to present data
+    data_for_token = data_if.drop(columns=[scenario_col])
+    token = tokenize(data_for_token)
+
+    # get the filename with the hash and check if it exists (separate for pm2 format
+    # and IF to fix broken datasets if necessary)
+    filename_hash = root_path / determine_filename(country_code, token, raw, hash=True)
+
+    # primap2 native format
+    filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
+    if not filename_hash_nc.exists():
+        # if parent dir does not exist create it
+        if not filename_hash.parent.exists():
+            filename_hash.parent.mkdir()
+        # save the data
+        print(f"Data has changed. Save to {filename_hash_nc.name}")
+        compression = dict(zlib=True, complevel=9)
+        encoding = {var: compression for var in data_pm2.data_vars}
+        data_pm2.pr.to_netcdf(filename_hash_nc, encoding=encoding)
+
+    # primap2 IF
+    filename_hash_csv = filename_hash.parent / (filename_hash.name + '.csv')
+    if not filename_hash_csv.exists():
+        # save the data
+        print(f"Data has changed. Save to {filename_hash.name + '.csv/.yaml'}")
+        pm2.pm2io.write_interchange_format(filename_hash, data_if)
+    else:
+        print(f"Data unchanged for {country_code}. Create symlinks.")
+
+    # get the filename with the date
+    filename_date = root_path / determine_filename(country_code, date_str, raw)
+
+    # create the symlinks to the actual data (with the hash)
+    suffixes = ['.nc', '.csv', '.yaml']
+    for suffix in suffixes:
+        file_date = filename_date.parent / (filename_date.name + suffix)
+        file_hash = filename_hash.name + suffix
+        if file_date.exists():
+            file_date.unlink()
+        file_date.symlink_to(file_hash)
+
+
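+# For reference, save_DI_country_data writes files of the form (country code,
+# date and hash shown are illustrative):
+#   <country_folder>/KEN_DI_<hash>_raw_hash.nc    (actual data)
+#   <country_folder>/KEN_DI_2023-05-24_raw.nc     (symlink to the hash file)
+# plus the corresponding .csv and .yaml interchange format files.
+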
+def save_DI_dataset(
+        data_pm2: xr.Dataset,
+        raw: bool = True,
+        annexI: bool = False,
+):
+    '''
+    Save primap2 and IF data to the dataset folder.
+    Can be used for raw and processed data, but not to save to country folders.
+    '''
+
+    # preparations
+    data_if = data_pm2.pr.to_interchange_format()
+    if annexI:
+        country_group = "AnnexI"
+    else:
+        country_group = "non-AnnexI"
+
+    ## get timestamp
+    scenario_col = data_pm2.attrs['scen']
+    scenarios = data_if[scenario_col].unique()
+    if len(scenarios) > 1:
+        raise ValueError(f"More than one scenario in input data. This function can "
+                         f"only handle single scenario data. Scenarios: {scenarios}")
+    else:
+        scenario = scenarios[0]
+
+    date_str = scenario[2:]
+
+    # calculate the hash of the data to see if it's identical to present data
+    data_for_token = data_if.drop(columns=[scenario_col])
+    token = tokenize(data_for_token)
+
+    # get the filename with the hash and check if it exists (separate for pm2 format
+    # and IF to fix broken datasets if necessary)
+    filename_hash = root_path / determine_dataset_filename(token, raw, annexI=annexI,
+                                               hash=True)
+    # primap2 native format
+    filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
+    if not filename_hash_nc.exists():
+        # if parent dir does not exist create it
+        # TODO double, also in determine_dataset_filename. same for country data
+        if not filename_hash.parent.exists():
+            filename_hash.parent.mkdir()
+        # save the data
+        print(f"Data has changed. Save to {filename_hash_nc.name}")
+        compression = dict(zlib=True, complevel=9)
+        encoding = {var: compression for var in data_pm2.data_vars}
+        data_pm2.pr.to_netcdf(filename_hash_nc, encoding=encoding)
+
+    # primap2 IF
+    filename_hash_csv = filename_hash.parent / (filename_hash.name + '.csv')
+    if not filename_hash_csv.exists():
+        # save the data
+        print(f"Data has changed. Save to {filename_hash.name + '.csv/.yaml'}")
+        pm2.pm2io.write_interchange_format(filename_hash, data_if)
+    else:
+        print(f"Data unchanged for {country_group}. Create symlinks.")
+
+    # get the filename with the date
+    filename_date = root_path / determine_dataset_filename(date_str, raw=raw,
+                                               annexI=annexI, hash=False)
+
+    # create the symlinks to the actual data (with the hash)
+    suffixes = ['.nc', '.csv', '.yaml']
+    for suffix in suffixes:
+        file_date = filename_date.parent / (filename_date.name + suffix)
+        file_hash = filename_hash.name + suffix
+        if file_date.exists():
+            file_date.unlink()
+        file_date.symlink_to(file_hash)
+
+
+## functions for multiple country reading
+def read_UNFCCC_DI_for_country_group(
+        annexI: bool = False,
+) -> xr.Dataset:
+    '''
+    This function reads DI data for all countries in a group (AnnexI or
+    non-AnnexI).
+    The function reads all data in one go using datalad run. As the output
+    data file names are unknown beforehand, datalad run is called with
+    explicit=False.
+    '''
+
+    today = date.today()
+    date_str = today.strftime(DI_date_format)
+
+    if annexI:
+        countries = AI_countries
+        data_all_if = None
+        country_group = "AnnexI"
+    else:
+        countries = nAI_countries
+        data_all = None
+        country_group = "non-AnnexI"
+
+    # read the data
+    for country in countries:
+        print(f"reading DI data for country {country}")
+
+        try:
+            data_country = read_UNFCCC_DI_for_country(
+                country_code=country,
+                category_groups=None,  # read all categories
+                read_subsectors=False,  # not applicable as we read all categories
+                date_str=date_str,
+                pm2if_specifications=None,
+                # automatically use the right specs for AI and NAI
+                default_gwp=None,  # automatically uses right default GWP for AI and NAI
+                debug=False)
+
+            if annexI:
+                # annexI data has additional dimensions and unfortunately the
+                # xarray merge function needs some extra memory which is not
+                # needed when converting from IF to pm2
+                if data_all_if is None:
+                    data_all_if = data_country.pr.to_interchange_format()
+                    attrs = data_all_if.attrs
+                else:
+                    data_all_if = pd.concat([data_all_if,
+                                             data_country.pr.to_interchange_format()])
+            else:
+                if data_all is None:
+                    data_all = data_country
+                else:
+                    data_all = data_all.pr.merge(data_country)
+
+        except unfccc_di_api.NoDataError as err:
+            print(f"No data for {country}.")
+            print(err)
+
+    if annexI:
+        data_all = pm2.pm2io.from_interchange_format(data_all_if, attrs=attrs,
+                                                     max_array_size=500000000000)
+
+    countries_present = list(data_all.coords[data_all.attrs['area']].values)
+    data_all.attrs["title"] = f"Data submitted by the following {country_group} " \
+                              f"countries and available in the DI interface on " \
+                              f"{date_str}: {', '.join(countries_present)}"
+
+    # save the data
+    save_DI_dataset(data_all, raw=True, annexI=annexI)
+
+    return data_all
+
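+# Illustrative call: read DI data for all non-AnnexI countries and save the
+# combined raw dataset
+#
+# data_nai = read_UNFCCC_DI_for_country_group(annexI=False)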
+
+def process_UNFCCC_DI_for_country_group(
+        annexI: bool = False,
+) -> xr.Dataset:
+    '''
+    This function processes DI data for all countries in a group (AnnexI or
+    non-AnnexI).
+    TODO: currently only non-AnnexI is implemented
+    The function processes all data in one go using datalad run. As the output
+    data file names are unknown beforehand, datalad run is called with
+    explicit=False.
+
+    TODO: use the latest
+    '''
+
+    today = date.today()
+    date_str = today.strftime(DI_date_format)
+
+    if annexI:
+        raise ValueError("Bulk reading for AnnexI countries not implemented yet")
+    else:
+        countries = nAI_countries
+
+    # read the data
+    data_all = None
+    for country in countries:
+        print(f"reading DI data for country {country}")
+
+        try:
+            data_country = read_UNFCCC_DI_for_country(
+                country_code=country,
+                category_groups=None,  # read all categories
+                read_subsectors=False,  # not applicable as we read all categories
+                date_str=date_str,
+                pm2if_specifications=None,
+                # automatically use the right specs for AI and NAI
+                default_gwp=None,  # automatically uses right default GWP for AI and NAI
+                debug=False)
+
+            if data_all is None:
+                data_all = data_country
+            else:
+                data_all = data_all.pr.merge(data_country)
+        except unfccc_di_api.NoDataError as err:
+            print(f"No data for {country}.")
+            print(err)
+
+    # TODO: write metadata
+
+    # save the data
+    save_DI_dataset(data_all, raw=True, annexI=annexI)
+
+    return data_all
+
+# TODO: add interface functions and scripts to read all data
+# add process-all functions and scripts
+# merge into main
+# run reading procedure
+# config for all DI data
+# re-run CRF etc.
+
+
+## datalad and pydoit interface functions
+def read_DI_for_country_datalad(
+        country: str,
+) -> None:
+    """
+    Wrapper around read_UNFCCC_DI_for_country which takes care of selecting input
+    and output files and using datalad run to trigger the data reading
+
+    Parameters
+    ----------
+
+    country: str
+        country name or ISO 3-letter country code
+
+    """
+
+    # get date to determine output filename
+    today = date.today()
+    date_str = today.strftime(DI_date_format)
+
+    # get all the info for the country
+    country_info = get_input_and_output_files_for_country_DI(country, date_str,
+                                                             raw=True, verbose=True)
+
+    print(f"Attempting to read DI data for {country_info['name']}.")
+    print("#"*80)
+    print("")
+    print(f"Using the UNFCCC_DI_reader")
+    print("")
+    print(f"Run the script using datalad run via the python api")
+    script = code_path / "UNFCCC_DI_reader" / "read_UNFCCC_DI_for_country.py"
+    script = script.relative_to(root_path)
+
+    cmd = f"./venv/bin/python3 {script.as_posix()} --country={country_info['code']} " \
+          f"--date={date_str}"
+    try:
+        datalad.api.run(
+            cmd=cmd,
+            dataset=root_path,
+            message=f"Read DI data for {country_info['name']}.",
+            inputs=country_info["input"],
+            outputs=country_info["output"],
+            dry_run=None,
+            explicit=False,
+        )
+    except IncompleteResultsError as IRE:
+        print(f"IncompleteResultsError occurred when running {cmd}: {IRE}")
+    except Exception as ex:
+        print(f"Exception occurred when running {cmd}")
+        print(ex)
+
+
+def process_DI_for_country_datalad(
+        country: str,
+        date_str: Union[str, None],
+) -> None:
+    """
+    Wrapper around process_UNFCCC_DI_for_country which takes care of selecting input
+    and output files and using datalad run to trigger the data processing
+
+    Parameters
+    ----------
+
+    country: str
+        country name or ISO 3-letter country code
+    date_str: str
+        Date of the data to be processed in the format %Y-%m-%d (e.g. 2023-01-30). If
+        no date is given the last data read will be processed.
+    """
+
+    # get all the info for the country
+    country_info = get_input_and_output_files_for_country_DI(country, date_str,
+                                                             raw=True, verbose=True)
+
+    print(f"Attempting to process DI data for {country_info['name']}.")
+    print("#"*80)
+    print("")
+    print(f"Using the UNFCCC_DI_reader")
+    print("")
+    print(f"Run the script using datalad run via the python api")
+    script = code_path / "UNFCCC_DI_reader" / "process_UNFCCC_DI_for_country.py"
+    script = script.relative_to(root_path)
+
+    cmd = f"./venv/bin/python3 {script.as_posix()} --country={country_info['code']} " \
+          f"--date={date_str}"
+    try:
+        datalad.api.run(
+            cmd=cmd,
+            dataset=root_path,
+            message=f"Process DI data for {country_info['name']}.",
+            inputs=country_info["input"],
+            outputs=country_info["output"],
+            dry_run=None,
+            explicit=False,
+        )
+    except IncompleteResultsError as IRE:
+        print(f"IncompleteResultsError occurred when running {cmd}: {IRE}")
+    except Exception as ex:
+        print(f"Exception occurred when running {cmd}")
+        print(ex)
+
+
+def read_DI_for_country_group_datalad(
+        annexI: bool=False,
+) -> None:
+    """
+    Wrapper around read_UNFCCC_DI_for_country_group which takes care of selecting
+    input and output files and using datalad run to trigger the data reading
+
+    Parameters
+    ----------
+
+    annexI: bool
+        If True, read data for AnnexI countries, else for non-AnnexI countries
+    """
+
+    if annexI:
+        country_group = "AnnexI"
+    else:
+        country_group = "non-AnnexI"
+
+    print(f"Attempting to read DI data for {country_group}.")
+    print("#"*80)
+    print("")
+    print(f"Using the UNFCCC_DI_reader")
+    print("")
+    print(f"Run the script using datalad run via the python api")
+    script = code_path / "UNFCCC_DI_reader" / "read_UNFCCC_DI_for_country_group.py"
+    script = script.relative_to(root_path)
+
+    cmd = f"./venv/bin/python3 {script.as_posix()} "
+    if annexI:
+        cmd = cmd + f" --annexI"
+
+    try:
+        datalad.api.run(
+            cmd=cmd,
+            dataset=root_path,
+            message=f"Read DI data for {country_group}.",
+            inputs=[],
+            outputs=[],
+            dry_run=None,
+            explicit=False,
+        )
+    except IncompleteResultsError as IRE:
+        print(f"IncompleteResultsError occurred when running {cmd}: {IRE}")
+    except Exception as ex:
+        print(f"Exception occurred when running {cmd}")
+        print(ex)
+
+
+## helper functions
+def determine_filename(
+        country_code: str,
+        date_or_hash: str,
+        raw: bool=False,
+        hash: bool=False,
+) -> Path:
+    """
+    Determine the filename for a dataset from given country code and date string.
+
+
+    Parameters
+    ----------
+    country_code: str
+        ISO 3 letter code of the country
+    date_or_hash:
+        formatted date string
+    raw: bool
+        bool specifying if the filename for raw or processed data should be returned
+    hash: bool
+        if True, return the filename for the hash-based file
+
+    Returns
+    -------
+        pathlib Path object for the file name (without suffix)
+
+    """
+
+    # get the country folder
+    with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
+        folder_mapping = json.load(mapping_file)
+
+    if country_code in folder_mapping:
+        file_filter = {}
+        file_filter["party"] = country_code
+        country_folders = folder_mapping[country_code]
+        if isinstance(country_folders, str):
+            # only one folder
+            country_folder = extracted_data_path_UNFCCC / country_folders
+        else:
+            raise ValueError("More than one output folder for country "
+                             f"{country_code}. This should not happen.")
+    else:
+        # folder not in mapping. It will be created if not present yet
+        country_name = get_country_name(country_code)
+        country_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
+
+        if country_folder.exists():
+            print(f"Output folder {country_name.replace(' ', '_')} for country "
+                  f"{country_code} exists but is not in the folder mapping. Update "
+                  "the folder mapping")
+        else:
+            country_folder.mkdir()
+
+    filename = f"{country_code}_DI_{date_or_hash}"
+    if raw:
+        filename = f"{filename}_raw"
+    if hash:
+        filename = f"{filename}_hash"
+    filename = country_folder / filename
+
+    return filename.relative_to(root_path)
+
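+# Example (illustrative): determine_filename("KEN", "2023-05-24", raw=True)
+# returns a path like <country_folder>/KEN_DI_2023-05-24_raw relative to the
+# repository root (the suffix is added by the caller).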
+
+def determine_dataset_filename(
+        date_or_hash: str,
+        raw: bool=False,
+        annexI: bool=False,
+        hash: bool = False,
+) -> Path:
+    """
+    Determine the filename for a dataset from given country group and date string.
+
+    Parameters
+    ----------
+    date_or_hash:
+        formatted date string
+    raw: bool
+        bool specifying if the filename for raw or processed data should be returned
+    annexI: bool, default False
+        True if AnnexI data, False if non-AnnexI data
+    hash: bool
+        if True, return the filename for the hash-based file
+
+    Returns
+    -------
+        pathlib Path object for the file name (without suffix)
+    """
+
+    # get the country folder
+    if annexI:
+        current_dataset_path = dataset_path_UNFCCC / "DI_AnnexI"
+        filename = f"DI_AnnexI_{date_or_hash}"
+    else:
+        current_dataset_path = dataset_path_UNFCCC / "DI_non_AnnexI"
+        filename = f"DI_non_AnnexI_{date_or_hash}"
+
+    if not current_dataset_path.exists():
+        current_dataset_path.mkdir()
+
+    if raw:
+        filename = f"{filename}_raw"
+    if hash:
+        filename = f"{filename}_hash"
+    filename = current_dataset_path / filename
+
+    return filename.relative_to(root_path)
+
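+# Example (illustrative; assumes dataset_path_UNFCCC points to datasets/UNFCCC):
+# determine_dataset_filename("2023-05-24", raw=True, annexI=True) returns
+# datasets/UNFCCC/DI_AnnexI/DI_AnnexI_2023-05-24_raw relative to the root.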
+
+def get_input_and_output_files_for_country_DI(
+        country: str,
+        date_str: str,
+        raw: bool,
+        verbose: Optional[bool]=True,
+) -> Dict[str, Union[List, str]]:
+    """
+    Get input and output files for a given country
+    """
+
+    country_info = {}
+
+    if country in custom_country_mapping:
+        country_code = country
+    else:
+        country_code = get_country_code(country)
+    # now get the country name
+    country_name = get_country_name(country_code)
+    country_info["code"] = country_code
+    country_info["name"] = country_name
+
+    # determine latest data
+    print(f"Determining output files for {country_name}")
+
+    # get input files (only for processing)
+    if raw:
+        input_files = []
+    else:
+        # get latest dataset if no date given
+        if date_str is None:
+            # get the latest date
+            input_file = [find_latest_DI_data(country_code, raw=True)]
+        else:
+            input_file = [determine_filename(country_code, date_str, raw=False,
+                                               hash=False)]
+            if input_file[0].is_symlink():
+                # also get the file with the actual data
+                input_file.append(input_file[0].readlink())
+            else:
+                # DI processing input files with date labels should always be
+                # symlinks to the files with hashes holding the actual data.
+                raise ValueError(f"Input file {input_file[0].name} is not a symlink "
+                                 f"or does not exist. Check if the data you want to "
+                                 f"process exists and if your repository is complete")
+
+        input_files = [f"{file.as_posix()}.{suffix}"
+                       for file in input_file
+                       for suffix in ['yaml', 'csv', 'nc']]
+
+        if verbose:
+            print(f"The following files are considered as input_files:")
+            for file in input_files:
+                print(file)
+            print("")
+
+    # get output files
+    output_file = determine_filename(country_code, date_str, raw=raw)
+    output_files = [f"{output_file.as_posix()}.{suffix}" for
+                    suffix in ['yaml', 'csv', 'nc']]
+
+    if verbose:
+        print(f"The following files are considered as output_files:")
+        for file in output_files:
+            print(file)
+        print("")
+
+    # add to country info
+    country_info["input"] = input_files
+    # output_files are not used as datalad outputs because we don't know the
+    # hash in advance
+    country_info["output"] = []
+
+    return country_info
+
+
+def get_present_hashes_for_country_DI(
+        country_code: str,
+        raw: bool,
+) -> List:
+    '''
+    Get the hashes of outputs
+    '''
+
+    regex_hash = r"_([a-f0-9]*)_"
+    if raw:
+        regex_hash = regex_hash + r"raw_hash\.nc"
+    else:
+        regex_hash = regex_hash + r"hash\.nc"
+
+    # get the country folder
+    with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
+        folder_mapping = json.load(mapping_file)
+
+    if country_code in folder_mapping:
+        file_filter = {}
+        file_filter["party"] = country_code
+        country_folders = folder_mapping[country_code]
+        if isinstance(country_folders, str):
+            # only one folder
+            country_folder = extracted_data_path_UNFCCC / country_folders
+        else:
+            raise ValueError("More than one output folder for country "
+                             f"{country_code}. This should not happen.")
+
+        files_list = list(country_folder.glob("*_hash.nc"))
+        # filter according to raw flag
+        if raw:
+            files_list = [file.name for file in files_list if
+                          re.search(r'_raw_hash', file.name)]
+        else:
+            files_list = [file.name for file in files_list if
+                          not re.search(r'_raw_hash', file.name)]
+
+        hash_list = [re.findall(regex_hash, file)[0] for file in files_list]
+        return hash_list
+
+    else:
+        # folder not in mapping.
+        return []
+
+
+def find_latest_DI_data(
+        country_code: str,
+        raw: bool = True,
+) -> Union[Path, None]:
+    '''
+    Find the path to the nc file with the latest DI data for a given country
+    '''
+
+    if raw:
+        regex = regex_date + r"_raw\.nc"
+    else:
+        regex = regex_date + r"\.nc"
+
+    # get the country folder
+    with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
+        folder_mapping = json.load(mapping_file)
+
+    if country_code in folder_mapping:
+        file_filter = {}
+        file_filter["party"] = country_code
+        country_folders = folder_mapping[country_code]
+        if isinstance(country_folders, str):
+            # only one folder
+            country_folder = extracted_data_path_UNFCCC / country_folders
+        else:
+            raise ValueError("More than one output folder for country "
+                             f"{country_code}. This should not happen.")
+
+        files_path_list = list(country_folder.glob("*.nc"))
+        # remove files with hash
+        files_list = [file.name for file in files_path_list
+                      if not re.search(r'_hash\.nc', file.name)]
+        # filter according to raw flag
+        if raw:
+            files_list = [file for file in files_list if
+                          re.search(r'_raw\.nc', file)]
+        else:
+            files_list = [file for file in files_list if
+                          not re.search(r'_raw\.nc', file)]
+
+        if len(files_list) > 0:
+            date_list = [re.findall(regex, file)[0] for file in files_list]
+            latest_date = find_latest_date(date_list, '%Y-%m-%d')
+            latest_file = [file for file in files_path_list if re.search(latest_date,
+                                                                         file.name)][0]
+            return latest_file
+        else:
+            return None
+
+    else:
+        # folder not in mapping.
+        return None
+
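+# Example (illustrative): find_latest_DI_data("KEN", raw=True) returns the
+# Path to the newest KEN_DI_<date>_raw.nc, or None if no such file exists.
+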
+# TODO
+
+# functions
+
+# def compare_with_existing
+# def

+ 23 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/__init__.py

@@ -0,0 +1,23 @@
+# submodule to read data from UNFCCC DI API using the unfccc_di_api package
+
+#import unfccc_di_api
+from .UNFCCC_DI_reader_core import \
+    read_UNFCCC_DI_for_country, read_DI_for_country_datalad, \
+    process_UNFCCC_DI_for_country, process_and_save_UNFCCC_DI_for_country, \
+    process_DI_for_country_datalad, \
+    convert_DI_data_to_pm2_if, convert_DI_IF_data_to_pm2, determine_filename, \
+    read_UNFCCC_DI_for_country_group, read_DI_for_country_group_datalad
+
+
+__all__ = [
+    "read_UNFCCC_DI_for_country",
+    "read_DI_for_country_datalad",
+    "process_UNFCCC_DI_for_country",
+    "process_and_save_UNFCCC_DI_for_country",
+    "process_DI_for_country_datalad",
+    "convert_DI_data_to_pm2_if",
+    "convert_DI_IF_data_to_pm2",
+    "determine_filename",
+    "read_UNFCCC_DI_for_country_group",
+    "read_DI_for_country_group_datalad",
+]

+ 26 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/process_UNFCCC_DI_for_country.py

@@ -0,0 +1,26 @@
+"""
+This script is a wrapper around the process_and_save_UNFCCC_DI_for_country
+function such that it can be called from datalad
+"""
+
+import argparse
+from UNFCCC_GHG_data.UNFCCC_DI_reader import \
+    process_and_save_UNFCCC_DI_for_country
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--country', help='Country code')
+parser.add_argument('--date', help='String with date to read and process. If not '
+                                   'given latest data will be used')
+args = parser.parse_args()
+
+country_code = args.country
+date_str = args.date
+
+if date_str == "None":
+    date_str = None
+
+process_and_save_UNFCCC_DI_for_country(
+    country_code=country_code,
+    date_str=date_str,
+)

+ 22 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/process_UNFCCC_DI_for_country_datalad.py

@@ -0,0 +1,22 @@
+"""
+wrapper around process_DI_for_country_datalad such that it can be called
+from doit in the current setup where doit runs on system python and
+not in the venv.
+"""
+
+from UNFCCC_GHG_data.UNFCCC_DI_reader import \
+    process_DI_for_country_datalad
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--country', help='Country name or code')
+parser.add_argument('--date', help='String with date to read and process. If not '
+                                   'given latest data will be used')
+args = parser.parse_args()
+country = args.country
+date_str = args.date
+
+if date_str == "None":
+    date_str = None
+
+process_DI_for_country_datalad(country, date_str=date_str)

+ 27 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country.py

@@ -0,0 +1,27 @@
+"""
+This script is a wrapper around the read_UNFCCC_DI_for_country
+function such that it can be called from datalad
+"""
+
+import argparse
+from UNFCCC_GHG_data.UNFCCC_DI_reader.UNFCCC_DI_reader_core import \
+    read_UNFCCC_DI_for_country
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--country', help='Country code')
+parser.add_argument('--date', help='String with current date')
+args = parser.parse_args()
+
+country_code = args.country
+date_str = args.date
+
+read_UNFCCC_DI_for_country(
+    country_code=country_code,
+    category_groups=None, # read all categories
+    read_subsectors=False, # not applicable as we read all categories
+    date_str=date_str,
+    pm2if_specifications=None, # automatically use the right specs for AI and NAI
+    default_gwp=None, # automatically uses right default GWP for AI and NAI
+    debug=False,
+)

+ 17 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_datalad.py

@@ -0,0 +1,17 @@
+"""
+wrapper around read_DI_for_country_datalad such that it can be called
+from doit in the current setup where doit runs on system python and
+not in the venv.
+"""
+
+from UNFCCC_GHG_data.UNFCCC_DI_reader import \
+    read_DI_for_country_datalad
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--country', help='Country name or code')
+
+args = parser.parse_args()
+country = args.country
+
+read_DI_for_country_datalad(country)

+ 19 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_group.py

@@ -0,0 +1,19 @@
+"""
+This script is a wrapper around the read_UNFCCC_DI_for_country_group
+function such that it can be called from datalad
+"""
+
+import argparse
+from UNFCCC_GHG_data.UNFCCC_DI_reader import \
+    read_UNFCCC_DI_for_country_group
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--annexI', help='read for AnnexI countries (default is for '
+                                     'non-AnnexI)', action='store_true')
+args = parser.parse_args()
+annexI = args.annexI
+
+read_UNFCCC_DI_for_country_group(
+    annexI=annexI,
+)

+ 19 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_group_datalad.py

@@ -0,0 +1,19 @@
+"""
+wrapper around read_DI_for_country_group_datalad such that it can be called
+from doit in the current setup where doit runs on system python and
+not in the venv.
+"""
+
+from UNFCCC_GHG_data.UNFCCC_DI_reader import \
+    read_DI_for_country_group_datalad
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--annexI', help='read for AnnexI countries (default is for '
+                                     'non-AnnexI)', action='store_true')
+args = parser.parse_args()
+annexI = args.annexI
+
+read_DI_for_country_group_datalad(
+    annexI=annexI,
+)

+ 13 - 0
UNFCCC_GHG_data/UNFCCC_DI_reader/util.py

@@ -0,0 +1,13 @@
+import unfccc_di_api
+
+reader = unfccc_di_api.UNFCCCApiReader()
+nAI_countries = list(reader.non_annex_one_reader.parties["code"])
+AI_countries = list(reader.annex_one_reader.parties["code"])
+
+DI_date_format = '%Y-%m-%d'
+regex_date = r"([0-9]{4}-[0-9]{2}-[0-9]{2})"
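+# e.g. date.today().strftime(DI_date_format) -> '2023-05-24'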
+
+class NoDIDataError(Exception):
+    pass
+
+

+ 5 - 0
UNFCCC_GHG_data/UNFCCC_downloader/__init__.py

@@ -0,0 +1,5 @@
+from .unfccc_submission_info import get_unfccc_submission_info
+
+__all__ = [
+    "get_unfccc_submission_info",
+]

+ 13 - 12
UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py

@@ -11,7 +11,7 @@ from selenium.webdriver.firefox.options import Options
 from random import randrange
 from pathlib import Path
 
-root = Path(__file__).parents[2]
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
 
 ###############
 #
@@ -77,12 +77,10 @@ else:
         "submissions/national-inventory-submissions-{}".format(year)
     )
 
-download_path = root / "downloaded_data" / "UNFCCC"
-
 error_file_sizes = [212, 210]
 
 # Read submissions list
-submissions = pd.read_csv(download_path / f"submissions-annexI_{year}.csv")
+submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-annexI_{year}.csv")
 
 # filter submissions list or category
 items = submissions[submissions.Kind  == category.upper()]
@@ -120,7 +118,7 @@ for idx, submission in items.iterrows():
     country = country.replace(' ', '_')
     print(f"Downloading {title} from {url}")
 
-    country_folder = download_path / country
+    country_folder = downloaded_data_path_UNFCCC / country
     if not country_folder.exists():
         country_folder.mkdir()
     local_filename = \
@@ -136,7 +134,7 @@ for idx, submission in items.iterrows():
             os.remove(local_filename)
     
     # now we have removed error pages, so a present file should not be overwritten
-    if not local_filename.exists():
+    if (not local_filename.exists()) and (not local_filename.is_symlink()):
         i = 0  # reset counter
         while not local_filename.exists() and i < 10:
             # for i = 0 and i = 5 try to get a new session ID
@@ -167,7 +165,7 @@ for idx, submission in items.iterrows():
             
         if local_filename.exists():
             new_downloaded.append(submission)
-            print(f"Download => {local_filename.relative_to(root)}")
+            print(f"Download => {local_filename.relative_to(root_path)}")
             # unzip data (only for new downloads)
             if local_filename.suffix == ".zip":
                 try:
@@ -177,18 +175,21 @@ for idx, submission in items.iterrows():
                     zipped_file.close()
                 # TODO Better error logging/visibility
                 except zipfile.BadZipFile:
-                    print(f"Error while trying to extract {local_filename.relative_to(root)}")
+                    print(f"Error while trying to extract "
+                          f"{local_filename.relative_to(root_path)}")
                 except NotImplementedError:
                     print("Zip format not supported, please unzip on the command line.")
             else:
-                print(f"Not attempting to extract {local_filename.relative_to(root)}.")
+                print(f"Not attempting to extract "
+                      f"{local_filename.relative_to(root_path)}.")
         else:
-            print(f"Failed to download {local_filename.relative_to(root)}")
+            print(f"Failed to download {local_filename.relative_to(root_path)}")
 
     else:
-        print(f"=> Already downloaded {local_filename.relative_to(root)}")
+        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
 
 driver.close()
 
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC
+          / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)

+ 14 - 11
UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py

@@ -3,11 +3,12 @@ import requests
 import shutil
 import time
 import os
+import re
 from datetime import date
 from random import randrange
-
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 from pathlib import Path
-root = Path(__file__).parents[2]
+
 """
 based on download_bur from national-inventory-submissions
 # (https://github.com/openclimatedata/national-inventory-submisions)
 # (https://github.com/openclimatedata/national-inventory-submisions)
 # (https://github.com/openclimatedata/national-inventory-submisions)
@@ -33,19 +34,20 @@ url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"
 # TODO error page sizes are from BUR and NC and might differ for NDCs
 # TODO error page sizes are from BUR and NC and might differ for NDCs
 # if an error page is found instead of a pdf adjust sizes here
 # if an error page is found instead of a pdf adjust sizes here
 error_file_sizes = [212, 210]
 error_file_sizes = [212, 210]
+ndc_regex = r".*\s([A-Za-z]*)\sNDC"
 
 
 # Ensure download path and subfolders exist
 # Ensure download path and subfolders exist
-download_path = root / "downloaded_data" / "UNFCCC"
-if not download_path.exists():
-    download_path.mkdir(parents=True)
+if not downloaded_data_path_UNFCCC.exists():
+    downloaded_data_path_UNFCCC.mkdir(parents=True)
 
 
 new_downloaded = []
 new_downloaded = []
 
 
-
 for idx, submission in submissions.iterrows():
 for idx, submission in submissions.iterrows():
     print("=" * 60)
     print("=" * 60)
-    ndc = submission.Number
+    #ndc = submission.Number
     title = submission.Title
     title = submission.Title
+    temp = re.findall(ndc_regex, title)
+    ndc = temp[0]
     url = submission.EncodedAbsUrl
     url = submission.EncodedAbsUrl
     submission_date = submission.SubmissionDate
     submission_date = submission.SubmissionDate
     country = submission.Party
     country = submission.Party
@@ -54,12 +56,12 @@ for idx, submission in submissions.iterrows():
 
 
     ndc_folder = "NDC_" + ndc + "_" + submission_date
     ndc_folder = "NDC_" + ndc + "_" + submission_date
 
 
-    country_folder = download_path / country
+    country_folder = downloaded_data_path_UNFCCC / country
     if not country_folder.exists():
     if not country_folder.exists():
         country_folder.mkdir()
         country_folder.mkdir()
     local_filename = country_folder / ndc_folder / url.split('/')[-1]
     local_filename = country_folder / ndc_folder / url.split('/')[-1]
     local_filename_underscore = \
     local_filename_underscore = \
-        download_path / country / ndc_folder / \
+        downloaded_data_path_UNFCCC / country / ndc_folder / \
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
     if not local_filename.parent.exists():
     if not local_filename.parent.exists():
         local_filename.parent.mkdir()
         local_filename.parent.mkdir()
@@ -73,7 +75,8 @@ for idx, submission in submissions.iterrows():
             os.remove(local_filename_underscore)
             os.remove(local_filename_underscore)
     
     
     # now we have to remove error pages, so a present file should not be overwritten
     # now we have to remove error pages, so a present file should not be overwritten
-    if not local_filename_underscore.exists():
+    if (not local_filename_underscore.exists()) \
+            and (not local_filename_underscore.is_symlink()):
         i = 0  # reset counter
         while not local_filename_underscore.exists() and i < 10:

@@ -102,4 +105,4 @@ for idx, submission in submissions.iterrows():

 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)
+df.to_csv(downloaded_data_path_UNFCCC / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)
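The new ndc_regex parses the NDC ordinal (e.g. "Second") out of the submission title instead of taking it from the Number column. A minimal sketch of the intended behaviour, with made-up titles; note that temp[0] in the loop above will raise an IndexError for any title the pattern does not match:

import re

ndc_regex = r".*\s([A-Za-z]*)\sNDC"

# made-up titles, for illustration only
for title in ["Argentina Second NDC", "Rwanda First NDC (Updated)"]:
    temp = re.findall(ndc_regex, title)
    print(temp[0] if temp else "no match")  # -> Second, First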

+ 9 - 10
UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py

@@ -9,8 +9,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from pathlib import Path
-
-root = Path(__file__).parents[2]
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC

 ###############
 #
@@ -45,8 +44,7 @@ else:
 error_file_sizes = [212, 210]

 # Read submissions list
-download_path = root / "downloaded_data" / "UNFCCC"
-submissions = pd.read_csv(download_path / f"submissions-{category.lower()}.csv")
+submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-{category.lower()}.csv")

 # set options for headless mode
 profile_path = ".firefox"
@@ -82,7 +80,7 @@ for idx, submission in submissions.iterrows():
     country = country.replace(' ', '_')
     print(f"Downloading {title} from {url}")

-    country_folder = download_path / country
+    country_folder = downloaded_data_path_UNFCCC / country
     if not country_folder.exists():
         country_folder.mkdir()
     local_filename = \
@@ -98,7 +96,7 @@ for idx, submission in submissions.iterrows():
             os.remove(local_filename)

     # now we have removed error pages, so a present file should not be overwritten
-    if not local_filename.exists():
+    if (not local_filename.exists()) and (not local_filename.is_symlink()):
         i = 0  # reset counter
         while not local_filename.exists() and i < 10:
             # for i = 0 and i = 5 try to get a new session ID
@@ -129,14 +127,15 @@ for idx, submission in submissions.iterrows():

         if local_filename.exists():
             new_downloaded.append(submission)
-            print(f"Download => {local_filename.relative_to(root)}")
+            print(f"Download => {local_filename.relative_to(root_path)}")
         else:
-            print(f"Failed to download {local_filename.relative_to(root)}")
+            print(f"Failed to download {local_filename.relative_to(root_path)}")

     else:
-        print(f"=> Already downloaded {local_filename.relative_to(root)}")
+        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")

 driver.close()

 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / f"00_new_downloads_{category}-{date.today()}.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC /
+          f"00_new_downloads_{category}-{date.today()}.csv", index=False)

+ 2 - 3
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py

@@ -8,8 +8,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from unfccc_submission_info import get_unfccc_submission_info
-
-root = Path(__file__).absolute().parents[2]
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC

 max_tries = 10

@@ -143,4 +142,4 @@ if len(no_downloads) > 0:

 driver.close()
 df = pd.DataFrame(downloads)
-df.to_csv(root / "downloaded_data" / "UNFCCC" / f"submissions-annexI_{year}.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC / f"submissions-annexI_{year}.csv", index=False)

+ 2 - 3
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py

@@ -9,8 +9,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from unfccc_submission_info import get_unfccc_submission_info
-
-root = Path(__file__).absolute().parents[2]
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC

 """
 Download UNFCCC Biennial Update Report submissions
@@ -84,4 +83,4 @@ if len(no_downloads) > 0:
 driver.close()
 df = pd.DataFrame(downloads)
 df = df[["Kind", "Country", "Title", "URL"]]
-df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-bur.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC / "submissions-bur.csv", index=False)

+ 4 - 4
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py

@@ -8,9 +8,9 @@ from bs4 import BeautifulSoup
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from random import randrange
-from unfccc_submission_info import get_unfccc_submission_info
-
-root = Path(__file__).absolute().parents[2]
+from UNFCCC_GHG_data.UNFCCC_downloader import \
+    get_unfccc_submission_info
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC

 """
 Download UNFCCC Biennial Update Report submissions
@@ -85,4 +85,4 @@ if len(no_downloads) > 0:
 driver.close()
 df = pd.DataFrame(downloads)
 df = df[["Kind", "Country", "Title", "URL"]]
-df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC / "submissions-nc.csv", index=False)
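With the script-local import replaced by a package import, the script now relies on UNFCCC_GHG_data being importable, i.e. the repository root must be on sys.path or the package installed. A sketch of this assumption; the invocation shown is one possibility, not prescribed by the repository:

# e.g. from the repository root:
#   python3 -m UNFCCC_GHG_data.UNFCCC_downloader.fetch_submissions_nc
from UNFCCC_GHG_data.UNFCCC_downloader import get_unfccc_submission_info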

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/Argentina/read_ARG_BUR4_from_pdf.py

@@ -6,7 +6,7 @@ import sys
 import camelot
 import primap2 as pm2
 from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
-from pathlib import Path
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path

 # ###
 # configuration
@@ -16,11 +16,6 @@ from pathlib import Path
 #  PRIMAP2 version

 # folders and files
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Argentina' / \
                'BUR4'
 output_folder = extracted_data_path / 'UNFCCC' / 'Argentina'
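All country readers below repeat this refactor: the four-line root_path boilerplate is dropped in favour of the two helper imports, and input and output folders are derived from them. A minimal sketch of the resulting configuration block for a new reader (country and submission are hypothetical):

from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path

input_folder = downloaded_data_path / 'UNFCCC' / 'Ghana' / 'BUR2'  # hypothetical
output_folder = extracted_data_path / 'UNFCCC' / 'Ghana'
if not output_folder.exists():
    output_folder.mkdir()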

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/Chile/read_CHL_BUR4_from_xlsx.py

@@ -5,9 +5,9 @@ import os
 import sys
 import pandas as pd
 import primap2 as pm2
-from pathlib import Path

 from config_CHL_BUR4 import cat_mapping, filter_remove_IPCC2006, aggregate_cats
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import filter_data

@@ -16,11 +16,6 @@ from primap2.pm2io._data_reading import filter_data
 # ###

 # folders and files
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Chile' / 'BUR4'
 output_folder = extracted_data_path / 'UNFCCC' / 'Chile'
 if not output_folder.exists():

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Colombia/read_COL_BUR3_from_xlsx.py

@@ -4,19 +4,12 @@

 import pandas as pd
 import primap2 as pm2
-from pathlib import Path
 from primap2.pm2io._data_reading import matches_time_format
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path

 # ###
 # configuration
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Colombia' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Colombia'
 if not output_folder.exists():

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Indonesia/read_IDN_BUR3_from_pdf.py

@@ -4,21 +4,14 @@

 import pandas as pd
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import numpy as np
 from primap2.pm2io._data_reading import matches_time_format
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path

 # ###
 # configuration
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Indonesia' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Indonesia'
 if not output_folder.exists():

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Mexico/read_MEX_BUR3_from_pdf.py

@@ -3,20 +3,13 @@

 import pandas as pd
 import primap2 as pm2
-from pathlib import Path
 import camelot
 from config_MEX_BUR3 import page_defs, fix_rows
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path

 # ###
 # configuration
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Mexico' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Mexico'
 if not output_folder.exists():

+ 2 - 8
UNFCCC_GHG_data/UNFCCC_reader/Morocco/read_MAR_BUR3_from_pdf.py

@@ -5,25 +5,19 @@ import camelot
 import primap2 as pm2
 import pandas as pd
 import copy
-from pathlib import Path
+
 from config_MAR_BUR3 import zero_cats, cat_mapping, aggregate_cats, remove_cats, \
     table_defs, header_defs
 from primap2.pm2io._data_reading import matches_time_format, filter_data
+from UNFCCC_GHG_data.helper import extracted_data_path, downloaded_data_path

 # ###
 # configuration
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Morocco' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Morocco'
 output_filename = 'MAR_BUR3_2022_'
-
 inventory_file = 'Morocco_BUR3_Fr.pdf'
-
 gwp_to_use = 'AR4GWP100'

 # years to read

+ 2 - 9
UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_2021-Inventory_from_xlsx.py

@@ -1,28 +1,21 @@
-# this script reads data from Korea's 2021 national inventory
+# this script reads data from Korea's 2021 national inventory, which underlies BUR4
 # Data is read from the xlsx file

 import os
 import sys
 import pandas as pd
 import primap2 as pm2
-from pathlib import Path

 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from config_KOR_BUR4 import remove_cats, aggregate_before_mapping, cat_mapping, \
     aggregate_after_mapping, coords_terminologies_2006, filter_remove_2006, \
     filter_remove_after_agg
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import filter_data, matches_time_format

 # ###
 # configuration
 # ###
-
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
                '2021-Inventory'
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Republic_of_Korea'

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py

@@ -5,19 +5,14 @@ import os
 import sys
 import pandas as pd
 import primap2 as pm2
-from pathlib import Path

+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from primap2.pm2io._data_reading import filter_data

 # ###
 # configuration
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
                '2020-Inventory'
 output_folder = extracted_data_path / 'UNFCCC' / 'Republic_of_Korea'

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Taiwan/read_TWN_2022-Inventory_from_pdf.py

@@ -3,11 +3,10 @@

 import pandas as pd
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import copy
-#import re

+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import matches_time_format

 from config_TWN_NIR2022 import table_defs, page_defs
@@ -17,12 +16,6 @@ from config_TWN_NIR2022 import gwp_to_use
 # ###
 # configuration
 # ###
-
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Taiwan'
 # TODO: move file to subfolder
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Taiwan'

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Thailand/read_THA_BUR3_from_pdf.py

@@ -1,23 +1,16 @@
 # this script reads data from Thailand's BUR3
 # Data is read from the pdf file
-
 import pandas as pd
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import copy

+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import matches_time_format

 # ###
 # configuration
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Thailand' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Thailand'
 if not output_folder.exists():

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/__init__.py

@@ -1,6 +1 @@
-# expose some of the functions to the outside as they are used in other readers as well
-# TODO: create a unified util module for all readers
-
-from .get_submissions_info import get_country_code
-
-__all__ = ["get_country_code"]
+#

+ 3 - 437
UNFCCC_GHG_data/UNFCCC_reader/get_submissions_info.py

@@ -5,324 +5,11 @@ from typing import List, Dict
 from pathlib import Path
 import json
 import pycountry
-#import os

-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-code_path = root_path / "UNFCCC_GHG_data" / "UNFCCC_reader"
-# beware, folders below are different than for CRF reader
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-legacy_data_path = root_path / "legacy_data"
-
-# TODO: move this to general util package
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom of Great Britain and Northern Ireland",
-}
-
-custom_folders = {
-    'Venezeula_(Bolivarian_Republic_of)': 'VEN',
-    'Venezuela_(Bolivarian_Republic_of)': 'VEN',
-    'Micronesia_(Federated_State_of)': 'FSM',
-    'Micronesia_(Federated_States_of)': 'FSM',
-    'The_Republic_of_North_Macedonia': 'MKD',
-    'Republic_of_Korea': 'KOR',
-    'Bolivia_(Plurinational_State_of)': 'BOL',
-    'Türkiye': 'TUR',
-    'Iran_(Islamic_Republic_of)': 'IRN',
-    'Côte_d’Ivoire': 'CIV',
-    'Democratic_Republic_of_the_Congo': "COD",
-    'European_Union': 'EUA',
-    'Taiwan': 'TWN',
-}
-
-def get_country_submissions(
-        country_name: str,
-        print_sub: bool = True,
-) -> Dict[str, List[str]]:
-    """
-    Input is a three letter ISO UNFCCC_GHG_data for a country, or the countries name.
-    The function tries to map the country name to an ISO UNFCCC_GHG_data and then
-    queries the folder mapping files for folders.
-
-    Parameters
-    ----------
-        country_name: str
-            String containing the country name or ISO 3 letter UNFCCC_GHG_data
-
-        print_sub: bool
-            If True information on submissions will be written to stdout
-
-    Returns
-    -------
-        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
-        Each value is a list of folders
-
-    """
-
-    data_folder = downloaded_data_path
-
-    country_code = get_country_code(country_name)
-
-    if print_sub:
-        print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
-
-    country_submissions = {}
-    if print_sub:
-        print(f"#" * 80)
-        print(f"The following submissions are available for {country_name}")
-    for item in data_folder.iterdir():
-        if item.is_dir():
-            if print_sub:
-                print("")
-                print("-" * 80)
-                print(f"Data folder {item.name}")
-                print("-" * 80)
-            with open(item / "folder_mapping.json", "r") as mapping_file:
-                folder_mapping = json.load(mapping_file)
-            if country_code in folder_mapping:
-                country_folders = folder_mapping[country_code]
-                if isinstance(country_folders, str):
-                    # only one folder
-                    country_folders = [country_folders]
-
-                submission_folders = []
-                for country_folder in country_folders:
-                    current_folder = item / country_folder
-                    if print_sub:
-                        print(f"Submissions in folder {country_folder}:")
-
-                    for submission_folder in current_folder.iterdir():
-                        if submission_folder.is_dir():
-                            if print_sub:
-                                print(submission_folder.name)
-                            submission_folders.append(submission_folder.name)
-
-                country_submissions[item.name] = submission_folders
-            else:
-                print(f"No submissions available for {country_name}.")
-
-    return country_submissions
-
-
-def get_country_datasets(
-        country_name: str,
-        print_ds: bool = True,
-) -> Dict[str, List[str]]:
-    """
-    Input is a three letter ISO UNFCCC_GHG_data for a country, or the country's name.
-    The function tries to map the country name to an ISO UNFCCC_GHG_data and then
-    checks the UNFCCC_GHG_data and data folders for content on the country.
-
-    Parameters
-    ----------
-        country_name: str
-            String containing the country name or ISO 3 letter UNFCCC_GHG_data
-
-        print_ds: bool
-            If True information on submissions will be written to stdout
-
-    Returns
-    -------
-        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
-        Each value is a list of folders
-
-    """
-
-    data_folder = extracted_data_path
-    data_folder_legacy = legacy_data_path
-
-
-    # obtain country UNFCCC_GHG_data
-    country_code = get_country_code(country_name)
-
-    if print_ds:
-        print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper import get_country_code

-rep_data = {}
-    # data
-    if print_ds:
-        print(f"#" * 80)
-        print(f"The following datasets are available for {country_name}")
-    for item in data_folder.iterdir():
-        if item.is_dir():
-            cleaned_datasets_current_folder = {}
-            if print_ds:
-                print("-" * 80)
-                print(f"Data folder {item.name}")
-                print("-" * 80)
-            with open(item / "folder_mapping.json", "r") as mapping_file:
-                folder_mapping = json.load(mapping_file)
-            if country_code not in folder_mapping:
-                if print_ds:
-                    print("No data available")
-                    print("")
-            else:
-                country_folder = folder_mapping[country_code]
-                if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
-
-                datasets_current_folder = {}
-                current_folder = item / country_folder
-
-                for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
-                        if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
-                        else:
-                            datasets_current_folder[data_file.stem] = [data_file.suffix]
-
-                for dataset in datasets_current_folder:
-                    # process filename to get submission
-                    parts = dataset.split('_')
-                    if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
-                    else:
-                        terminology = "_".join(parts[3 : ])
-                        key = f"{parts[1]} ({parts[2]}, {terminology})"
-                        data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
-                            data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
-                            data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF (.yaml), "
-
-                        code_file = get_code_file(country_code, parts[1])
-                        if code_file:
-                            data_info = data_info + f"UNFCCC_GHG_data: {code_file.name}"
-                        else:
-                            data_info = data_info + f"UNFCCC_GHG_data: not found"
-
-                        cleaned_datasets_current_folder[key] = data_info
-
-                if print_ds:
-                    if cleaned_datasets_current_folder:
-                        for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
-                    else:
-                        print("No data available")
-                    print("")
-
-            rep_data[item.name] = cleaned_datasets_current_folder
-
-    # legacy data
-    if print_ds:
-        print(f"#" * 80)
-        print(f"The following legacy datasets are available for {country_name}")
-    legacy_data = {}
-    for item in data_folder_legacy.iterdir():
-        if item.is_dir():
-            cleaned_datasets_current_folder = {}
-            if print_ds:
-                print("-" * 80)
-                print(f"Data folder {item.name}")
-                print("-" * 80)
-            with open(item / "folder_mapping.json", "r") as mapping_file:
-                folder_mapping = json.load(mapping_file)
-            if country_code not in folder_mapping:
-                if print_ds:
-                    print("No data available")
-                    print("")
-            else:
-                country_folder = folder_mapping[country_code]
-                if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
-
-                datasets_current_folder = {}
-                current_folder = item / country_folder
-
-                for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
-                        if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
-                        else:
-                            datasets_current_folder[data_file.stem] = [data_file.suffix]
-
-                for dataset in datasets_current_folder:
-                    # process filename to get submission
-                    parts = dataset.split('_')
-                    if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
-                    else:
-                        terminology = "_".join(parts[3 : ])
-                        key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
-                        data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
-                            data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
-                            data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF (.yaml), "
-
-                        cleaned_datasets_current_folder[key] = data_info
-
-                if print_ds:
-                    if cleaned_datasets_current_folder:
-                        for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
-                    else:
-                        print("No data available")
-                    print("")
-
-                legacy_data[item.name] = cleaned_datasets_current_folder
-
-    all_data = {
-        "rep_data": rep_data,
-        "legacy_data": legacy_data,
-    }
-
-    return all_data
-
-
-def get_country_code(
-        country_name: str,
-)->str:
-    """
-    obtain country UNFCCC_GHG_data. If the input is a UNFCCC_GHG_data it will be returned, if the input
-    is not a three letter UNFCCC_GHG_data a search will be performed
-
-    Parameters
-    __________
-    country_name: str
-        Country UNFCCC_GHG_data or name to get the three-letter UNFCCC_GHG_data for.
-
-    """
-    # First check if it's in the list of custom codes
-    if country_name in custom_country_mapping:
-        country_code = country_name
-    else:
-        try:
-            # check if it's a 3 letter UNFCCC_GHG_data
-            country = pycountry.countries.get(alpha_3=country_name)
-            country_code = country.alpha_3
-        except:
-            try:
-                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
-            except:
-                raise ValueError(f"Country name {country_name} can not be mapped to "
-                                 f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
-            if len(country) > 1:
-                country_code = None
-                for current_country in country:
-                    if current_country.name == country_name:
-                        country_code = current_country.alpha_3
-                if country_code is None:
-                    raise ValueError(f"Country name {country_name} has {len(country)} "
-                                     f"possible results for country codes.")
-
-            country_code = country[0].alpha_3
-
-    return country_code
+code_path = root_path / "UNFCCC_GHG_data" / "UNFCCC_reader"


 def get_possible_inputs(
@@ -446,128 +133,7 @@ def get_possible_outputs(
     return output_files

-def get_code_file(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
-) -> Path:
-    """
-    For given country name and submission find the script that creates the data
 
 
-    Parameters
-    ----------
-        country_name: str
-            String containing the country name or ISO 3 letter UNFCCC_GHG_data
-
-        submission: str
-            String of the submission
-
-        print_info: bool = False
-            If True print information on UNFCCC_GHG_data found
-
-    Returns
-    -------
-        returns a pathlib Path object for the UNFCCC_GHG_data file
-    """
-
-    code_file_path = None
-
-    # CRF is an exception as it's read using the UNFCCC_CRF_reader module
-    # so we return the path to that.
-    if submission[0:3] == "CRF":
-        return root_path / "UNFCCC_CRF_reader"
 
 
-    # obtain country UNFCCC_GHG_data
-    country_code = get_country_code(country_name)
-
-    if print_info:
-        print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
-
-    with open(code_path / "folder_mapping.json", "r") as mapping_file:
-        folder_mapping = json.load(mapping_file)
-
-    if country_code not in folder_mapping:
-        if print_info:
-            print("No UNFCCC_GHG_data available")
-            print("")
-    else:
-        country_folder = code_path / folder_mapping[country_code]
-        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
-
-        for file in country_folder.iterdir():
-            if file.match(code_file_name_candidate):
-                if code_file_path is not None:
-                    raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
-                                     f"{code_file_path} and file.name. "
-                                     f"Please use only one file with name "
-                                     f"'read_ISO3_submission_XXX.YYY'.")
-                else:
-                    if print_info:
-                        print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
-                code_file_path = file
-
-    if code_file_path is not None:
-        return code_file_path.relative_to(root_path)
-    else:
-        return None
-
-
-def create_folder_mapping(
-        folder: str,
-        extracted: bool = False
-) -> None:
-    """
-    Create a mapping from 3 letter ISO country codes to folders
-    based on the subfolders of the given folder. The mapping is
-    stored in 'folder_mapping.json' in the given folder. Folder
-    must be given relative to the repository root

-    Parameters
-    ----------
-        folder: str
-            folder to create the mapping for
-        extracted: bool = False
-            If true treat the folder as extracted data, where we
-            only have one folder per country and no typos in the
-            names
-
-    Returns
-    -------
-        Nothing
-
-    """
 
 
-    folder = root_path / folder
-    folder_mapping = {}
-    #if not extracted:
-    known_folders = custom_folders
-    #else:
-    #    known_folders = {}
-
-    for item in folder.iterdir():
-        if item.is_dir() and not item.match("__pycache__"):
-            if item.name in known_folders:
-                ISO3 = known_folders[item.name]
-            else:
-                try:
-                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
-                    if len(country) > 1:
-                        ISO3 = None
-                        for current_country in country:
-                            if current_country.name == item.name.replace("_", " "):
-                                ISO3 = current_country.alpha_3
-                    else:
-                        ISO3 = country[0].alpha_3
-                except:
-                    ISO3 = None
-
-            if ISO3 is None:
-                print(f"No match for {item.name}")
-            else:
-                if ISO3 in folder_mapping.keys():
-                    folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
-                else:
-                    folder_mapping[ISO3] = item.name
-
-    with open(folder / "folder_mapping.json", "w") as mapping_file:
-        json.dump(folder_mapping, mapping_file, indent=4)

+ 3 - 9
UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py

@@ -1,15 +1,12 @@
 # this script takes submission and country as input (from doit) and
 # runs the appropriate script to extract the submission data

-import sys
 import datalad.api
-from pathlib import Path
 import argparse
 from get_submissions_info import get_code_file
 from get_submissions_info import get_possible_inputs
 from get_submissions_info import get_possible_outputs
-
-
+from UNFCCC_GHG_data.helper import root_path

 # Find the right function and possible input and output files and
 # read the data using datalad run.
@@ -22,9 +19,6 @@ args = parser.parse_args()
 country = args.country
 submission = args.submission

-codepath = Path(__file__).parent
-rootpath = codepath / ".." / ".."
-rootpath = rootpath.resolve()

 print(f"Attempting to extract data for {submission} from {country}.")
 print("#"*80)
@@ -49,7 +43,7 @@ if script_name is not None:
         print("")
         print("")
     # make input files absolute to avoid datalad confusions when
     # make input files absolute to avoid datalad confusions when
     # root directory is via symlink
     # root directory is via symlink
-    input_files = [rootpath / file for file in input_files]
+    input_files = [root_path / file for file in input_files]
     # convert file's path to str
     input_files = [file.as_posix() for file in input_files]

@@ -69,7 +63,7 @@ if script_name is not None:
     print(f"Run the script using datalad run via the python api")
     print(f"Run the script using datalad run via the python api")
     datalad.api.run(
     datalad.api.run(
         cmd=f"./venv/bin/python3 {script_name.as_posix()}",
         cmd=f"./venv/bin/python3 {script_name.as_posix()}",
-        dataset=rootpath,
+        dataset=root_path,
         message=f"Read data for {country}, {submission}.",
         inputs=input_files,
         outputs=output_files,
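For orientation, the call above reduces to the following pattern; inputs are made absolute (and converted to POSIX strings) so datalad resolves them even when the repository root is reached via a symlink. The file name below is hypothetical:

import datalad.api
from UNFCCC_GHG_data.helper import root_path

input_files = [root_path / "downloaded_data/UNFCCC/Argentina/BUR4/report.pdf"]  # hypothetical
datalad.api.run(
    cmd="./venv/bin/python3 UNFCCC_GHG_data/UNFCCC_reader/Argentina/read_ARG_BUR4_from_pdf.py",
    dataset=root_path,  # run inside the root dataset
    message="Read data for Argentina, BUR4.",
    inputs=[f.as_posix() for f in input_files],
    outputs=[],  # the real script fills this from get_possible_outputs
)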

+ 6 - 1
UNFCCC_GHG_data/__init__.py

@@ -2,7 +2,12 @@

 from . import UNFCCC_reader
 from . import UNFCCC_CRF_reader
+from . import helper
 # import UNFCCC_DI_reader
 # import UNFCCC_downloader

-__all__ = ["UNFCCC_reader", "UNFCCC_CRF_reader"]
+__all__ = [
+    "UNFCCC_reader",
+    "UNFCCC_CRF_reader",
+    "helper",
+]

+ 27 - 0
UNFCCC_GHG_data/helper/__init__.py

@@ -0,0 +1,27 @@
+from .definitions import root_path, code_path, log_path
+from .definitions import extracted_data_path, extracted_data_path_UNFCCC
+from .definitions import legacy_data_path
+from .definitions import downloaded_data_path, downloaded_data_path_UNFCCC
+from .definitions import dataset_path, dataset_path_UNFCCC
+from .definitions import custom_country_mapping, custom_folders
+from .functions import get_country_code, get_country_name, convert_categories
+from .functions import create_folder_mapping
+
+__all__ = [
+    "root_path",
+    "code_path",
+    "log_path",
+    "extracted_data_path",
+    "extracted_data_path_UNFCCC",
+    "legacy_data_path",
+    "downloaded_data_path",
+    "downloaded_data_path_UNFCCC",
+    "dataset_path",
+    "dataset_path_UNFCCC",
+    "custom_country_mapping",
+    "custom_folders",
+    "get_country_code",
+    "get_country_name",
+    "convert_categories",
+    "create_folder_mapping",
+]
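Downstream code can now pull all paths and utilities from one place; a small usage sketch (output depends on the local checkout):

from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC, get_country_code

print(downloaded_data_path_UNFCCC)    # <root>/downloaded_data/UNFCCC
print(get_country_code("Indonesia"))  # expected: IDN, via fuzzy search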

+ 2 - 2
UNFCCC_GHG_data/UNFCCC_reader/country_info.py → UNFCCC_GHG_data/helper/country_info.py

@@ -2,8 +2,8 @@
 # runs displays available submissions and datasets

 import argparse
-from get_submissions_info import get_country_submissions
-from get_submissions_info import get_country_datasets
+from UNFCCC_GHG_data.helper.functions import get_country_submissions
+from UNFCCC_GHG_data.helper.functions import get_country_datasets

 # Find the right function and possible input and output files and
 # read the data using datalad run.

+ 49 - 0
UNFCCC_GHG_data/helper/definitions.py

@@ -0,0 +1,49 @@
+import os
+from pathlib import Path
+
+
+def get_root_path() -> Path:
+    """ get the root_path from an environment variable """
+    root_path_env = os.getenv('UNFCCC_GHG_ROOT_PATH', None)
+    if root_path_env is None:
+        raise ValueError('UNFCCC_GHG_ROOT_PATH environment variable needs to be set')
+    else:
+        root_path = Path(root_path_env).resolve()
+    return root_path
+
+root_path = get_root_path()
+code_path = root_path / "UNFCCC_GHG_data"
+log_path = root_path / "log"
+extracted_data_path = root_path / "extracted_data"
+extracted_data_path_UNFCCC = extracted_data_path / "UNFCCC"
+downloaded_data_path = root_path / "downloaded_data"
+downloaded_data_path_UNFCCC = downloaded_data_path / "UNFCCC"
+legacy_data_path = root_path / "legacy_data"
+dataset_path = root_path / "datasets"
+dataset_path_UNFCCC = dataset_path / "UNFCCC"
+
+
+custom_country_mapping = {
+    "EUA": "European Union",
+    "EUC": "European Union",
+    "FRK": "France",
+    "DKE": "Denmark",
+    "DNM": "Denmark",
+    "GBK": "United Kingdom of Great Britain and Northern Ireland",
+}
+
+custom_folders = {
+    'Venezeula_(Bolivarian_Republic_of)': 'VEN',
+    'Venezuela_(Bolivarian_Republic_of)': 'VEN',
+    'Micronesia_(Federated_State_of)': 'FSM',
+    'Micronesia_(Federated_States_of)': 'FSM',
+    'The_Republic_of_North_Macedonia': 'MKD',
+    'Republic_of_Korea': 'KOR',
+    'Bolivia_(Plurinational_State_of)': 'BOL',
+    'Türkiye': 'TUR',
+    'Iran_(Islamic_Republic_of)': 'IRN',
+    'Côte_d’Ivoire': 'CIV',
+    'Democratic_Republic_of_the_Congo': "COD",
+    'European_Union': 'EUA',
+    'Taiwan': 'TWN',
+}
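get_root_path() raises unless UNFCCC_GHG_ROOT_PATH is set, so the variable must be exported before anything imports this module; the path below is illustrative:

# in the shell:
#   export UNFCCC_GHG_ROOT_PATH=/path/to/UNFCCC_non-AnnexI_data
import os
os.environ.setdefault("UNFCCC_GHG_ROOT_PATH", "/path/to/UNFCCC_non-AnnexI_data")

from UNFCCC_GHG_data.helper.definitions import downloaded_data_path_UNFCCC
print(downloaded_data_path_UNFCCC)  # .../downloaded_data/UNFCCC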

+ 1 - 1
UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py → UNFCCC_GHG_data/helper/folder_mapping.py

@@ -3,7 +3,7 @@
 # oir that folder

 import argparse
-from get_submissions_info import create_folder_mapping
+from UNFCCC_GHG_data.helper import create_folder_mapping

 # Find the right function and possible input and output files and
 # read the data using datalad run.

+ 510 - 0
UNFCCC_GHG_data/helper/functions.py

@@ -0,0 +1,510 @@
+import pycountry
+import json
+import pandas as pd
+import xarray as xr
+from copy import deepcopy
+from typing import Dict, List
+from pathlib import Path
+from .definitions import custom_country_mapping, custom_folders
+from .definitions import root_path, downloaded_data_path, extracted_data_path
+from .definitions import legacy_data_path, code_path
+
+
+def convert_categories(
+        ds_input: xr.Dataset,
+        conversion: Dict[str, Dict[str, str]],
+        #terminology_from: str,
+        terminology_to: str,
+        debug: bool=False,
+        tolerance: float=0.01,
+)->xr.Dataset:
+    """
+    convert data from one category terminology to another
+    """
+    ds_converted = ds_input.copy(deep=True)
+    ds_converted.attrs = deepcopy(ds_input.attrs)
+
+    # change category terminology
+    cat_dim = ds_converted.attrs["cat"]
+    ds_converted.attrs["cat"] = f"category ({terminology_to})"
+    ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})
+
+    # find categories present in dataset
+    cats_present = list(ds_converted.coords[f'category ({terminology_to})'])
+
+    # restrict categories and map category names
+    if 'mapping' in conversion.keys():
+        mapping_cats_present = [cat for cat in list(conversion['mapping'].keys()) if
+                                cat in cats_present]
+        ds_converted = ds_converted.pr.loc[
+            {'category': mapping_cats_present}]
+
+        from_cats = ds_converted.coords[f'category ({terminology_to})'].values
+        to_cats = pd.Series(from_cats).replace(conversion['mapping'])
+        ds_converted = ds_converted.assign_coords({f'category ({terminology_to})':
+                                                   (f'category ({terminology_to})',
+                                                    to_cats)})
+
+    # redo the list of present cats after mapping, as we have new categories in the
+    # target terminology now
+    cats_present_mapped = list(ds_converted.coords[f'category ({terminology_to})'])
+    # aggregate categories
+    if 'aggregate' in conversion:
+        aggregate_cats = conversion['aggregate']
+        for cat_to_agg in aggregate_cats:
+            if debug:
+                print(f"Category: {cat_to_agg}")
+            source_cats = [cat for cat in aggregate_cats[cat_to_agg]['sources'] if
+                           cat in cats_present_mapped]
+            data_agg = ds_converted.pr.loc[{'category': source_cats}].pr.sum(
+                dim='category', skipna=True, min_count=1)
+            nan_vars = [var for var in data_agg.data_vars if
+                        data_agg[var].isnull().all().data == True]
+            data_agg = data_agg.drop(nan_vars)
+            if len(data_agg.data_vars) > 0:
+                data_agg = data_agg.expand_dims([f'category ({terminology_to})'])
+                data_agg = data_agg.assign_coords(
+                    coords={f'category ({terminology_to})':
+                                (f'category ({terminology_to})', [cat_to_agg])})
+                ds_converted = ds_converted.pr.merge(data_agg, tolerance=tolerance)
+            else:
+                print(f"no data to aggregate category {cat_to_agg}")
+
+    return ds_converted
+
+
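A usage sketch for convert_categories; the dataset, category codes and mapping are made up, but 'mapping' and 'aggregate' are the two keys the function handles:

# hypothetical conversion into IPCC2006_PRIMAP terminology
conversion = {
    "mapping": {"1A1": "1.A.1", "1A2": "1.A.2"},
    "aggregate": {"1.A": {"sources": ["1.A.1", "1.A.2"]}},
}
ds_ipcc2006 = convert_categories(
    ds_country,  # a primap2 dataset with a category dimension, read earlier
    conversion,
    terminology_to="IPCC2006_PRIMAP",
    tolerance=0.01,
)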
+def get_country_name(
+        country_code: str,
+) -> str:
+    """get country name from code """
+    if country_code in custom_country_mapping:
+        country_name = custom_country_mapping[country_code]
+    else:
+        try:
+            country = pycountry.countries.get(alpha_3=country_code)
+            country_name = country.name
+        except:
+            raise ValueError(f"Country code {country_code} can not be mapped to "
+                             f"any country")
+
+    return country_name
+
+
+def get_country_code(
+        country_name: str,
+)->str:
+    """
+    obtain country code. If the input is a three letter code it is returned
+    as-is, otherwise a search for the country name is performed
+
+    Parameters
+    __________
+    country_name: str
+        Country code or name to get the three-letter code for.
+
+    Returns
+    -------
+        country_code: str
+
+    """
+    # First check if it's in the list of custom codes
+    if country_name in custom_country_mapping:
+        country_code = country_name
+    else:
+        try:
+            # check if it's a 3 letter code
+            country = pycountry.countries.get(alpha_3=country_name)
+            country_code = country.alpha_3
+        except:
+            try:
+                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
+            except:
+                raise ValueError(f"Country name {country_name} can not be mapped to "
+                                 f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
+            if len(country) > 1:
+                country_code = None
+                for current_country in country:
+                    if current_country.name == country_name:
+                        country_code = current_country.alpha_3
+                if country_code is None:
+                    raise ValueError(f"Country name {country_name} has {len(country)} "
+                                     f"possible results for country codes.")
+
+            country_code = country[0].alpha_3
+
+    return country_code
+
+
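Expected behaviour of the two lookups, for illustration (custom codes bypass pycountry entirely):

get_country_code("Indonesia")  # -> "IDN", via pycountry fuzzy search
get_country_code("KOR")        # -> "KOR", already a valid alpha-3 code
get_country_name("EUA")        # -> "European Union", from custom_country_mapping
get_country_name("MEX")        # -> "Mexico", from pycountry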
+def create_folder_mapping(
+        folder: str,
+        extracted: bool = False
+) -> None:
+    """
+    Create a mapping from 3 letter ISO country codes to folders
+    based on the subfolders of the given folder. The mapping is
+    stored in 'folder_mapping.json' in the given folder. Folder
+    must be given relative to the repository root
+
+    Parameters
+    ----------
+        folder: str
+            folder to create the mapping for
+        extracted: bool = False
+            If true treat the folder as extracted data, where we
+            only have one folder per country and no typos in the
+            names
+
+    Returns
+    -------
+        Nothing
+
+    """
+
+    folder = root_path / folder
+    folder_mapping = {}
+    #if not extracted:
+    known_folders = custom_folders
+    #else:
+    #    known_folders = {}
+
+    for item in folder.iterdir():
+        if item.is_dir() and not item.match("__pycache__"):
+            if item.name in known_folders:
+                ISO3 = known_folders[item.name]
+            else:
+                try:
+                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
+                    if len(country) > 1:
+                        ISO3 = None
+                        for current_country in country:
+                            if current_country.name == item.name.replace("_", " "):
+                                ISO3 = current_country.alpha_3
+                    else:
+                        ISO3 = country[0].alpha_3
+                except:
+                    ISO3 = None
+
+            if ISO3 is None:
+                print(f"No match for {item.name}")
+            else:
+                if ISO3 in folder_mapping.keys():
+                    folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
+                else:
+                    folder_mapping[ISO3] = item.name
+
+    with open(folder / "folder_mapping.json", "w") as mapping_file:
+        json.dump(folder_mapping, mapping_file, indent=4)
+
+
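The JSON written by create_folder_mapping maps each ISO3 code to a folder name, or to a list where several folder spellings exist; the entries below are illustrative:

folder_mapping_example = {
    "ARG": "Argentina",
    "FSM": ["Micronesia_(Federated_State_of)",
            "Micronesia_(Federated_States_of)"],  # two folders -> list value
}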
+# TODO add crf
+def get_country_submissions(
+        country_name: str,
+        print_sub: bool = True,
+) -> Dict[str, List[str]]:
+    """
+    Input is a three letter ISO code for a country, or the country's name.
+    The function tries to map the country name to an ISO code and then
+    queries the folder mapping files for folders.
+
+    Parameters
+    ----------
+        country_name: str
+            String containing the country name or ISO 3 letter code
+
+        print_sub: bool
+            If True information on submissions will be written to stdout
+
+    Returns
+    -------
+        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
+        Each value is a list of folders
+
+    """
+
+    data_folder = downloaded_data_path
+
+    country_code = get_country_code(country_name)
+
+    if print_sub:
+        print(f"Country name {country_name} maps to ISO code {country_code}")
+
+    country_submissions = {}
+    if print_sub:
+        print(f"#" * 80)
+        print(f"The following submissions are available for {country_name}")
+    for item in data_folder.iterdir():
+        if item.is_dir():
+            if print_sub:
+                print("")
+                print("-" * 80)
+                print(f"Data folder {item.name}")
+                print("-" * 80)
+            with open(item / "folder_mapping.json", "r") as mapping_file:
+                folder_mapping = json.load(mapping_file)
+            if country_code in folder_mapping:
+                country_folders = folder_mapping[country_code]
+                if isinstance(country_folders, str):
+                    # only one folder
+                    country_folders = [country_folders]
+
+                submission_folders = []
+                for country_folder in country_folders:
+                    current_folder = item / country_folder
+                    if print_sub:
+                        print(f"Submissions in folder {country_folder}:")
+
+                    for submission_folder in current_folder.iterdir():
+                        if submission_folder.is_dir():
+                            if print_sub:
+                                print(submission_folder.name)
+                            submission_folders.append(submission_folder.name)
+
+                country_submissions[item.name] = submission_folders
+            else:
+                print(f"No submissions available for {country_name}.")
+
+    return country_submissions
+
+
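A usage sketch; the result is keyed by the subfolders of downloaded_data, and the values shown are hypothetical:

subs = get_country_submissions("Morocco", print_sub=False)
# e.g. {"UNFCCC": ["BUR3", "NC3"], "non-UNFCCC": []}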
+def get_country_datasets(
+        country_name: str,
+        print_ds: bool = True,
+) -> Dict[str, List[str]]:
+    """
+    Input is a three letter ISO code for a country, or the country's name.
+    The function tries to map the country name to an ISO code and then
+    checks the code and data folders for content on the country.
+
+    Parameters
+    ----------
+        country_name: str
+            String containing the country name or ISO 3 letter code
+
+        print_ds: bool
+            If True information on submissions will be written to stdout
+
+    Returns
+    -------
+        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
+        Each value is a list of folders
+
+    """
+
+    data_folder = extracted_data_path
+    data_folder_legacy = legacy_data_path
+
+    # obtain country code
+    country_code = get_country_code(country_name)
+
+    if print_ds:
+        print(f"Country name {country_name} maps to ISO code {country_code}")
+
+    rep_data = {}
+    # data
+    if print_ds:
+        print(f"#" * 80)
+        print(f"The following datasets are available for {country_name}")
+    for item in data_folder.iterdir():
+        if item.is_dir():
+            cleaned_datasets_current_folder = {}
+            if print_ds:
+                print("-" * 80)
+                print(f"Data folder {item.name}")
+                print("-" * 80)
+            with open(item / "folder_mapping.json", "r") as mapping_file:
+                folder_mapping = json.load(mapping_file)
+            if country_code not in folder_mapping:
+                if print_ds:
+                    print("No data available")
+                    print("")
+            else:
+                country_folder = folder_mapping[country_code]
+                if not isinstance(country_folder, str):
+                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+
+                datasets_current_folder = {}
+                current_folder = item / country_folder
+
+                for data_file in current_folder.iterdir():
+                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                        if data_file.stem in datasets_current_folder:
+                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                        else:
+                            datasets_current_folder[data_file.stem] = [data_file.suffix]
+
+                for dataset in datasets_current_folder:
+                    # process filename to get submission
+                    parts = dataset.split('_')
+                    if parts[0] != country_code:
+                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
+                    else:
+                        terminology = "_".join(parts[3 : ])
+                        key = f"{parts[1]} ({parts[2]}, {terminology})"
+                        data_info = ""
+                        if '.nc' in datasets_current_folder[dataset]:
+                            data_info = data_info + "NF (.nc), "
+                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                            data_info = data_info + "IF (.yaml + .csv), "
+                        elif '.csv' in datasets_current_folder[dataset]:
+                            data_info = data_info + "incomplete IF? (.csv), "
+                        elif '.yaml' in datasets_current_folder[dataset]:
+                            data_info = data_info + "incomplete IF (.yaml), "
+
+                        code_file = get_code_file(country_code, parts[1])
+                        if code_file:
+                            data_info = data_info + f"code: {code_file.name}"
+                        else:
+                            data_info = data_info + f"code: not found"
+
+                        cleaned_datasets_current_folder[key] = data_info
+
+                if print_ds:
+                    if cleaned_datasets_current_folder:
+                        for country_ds in cleaned_datasets_current_folder:
+                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                    else:
+                        print("No data available")
+                    print("")
+
+            rep_data[item.name] = cleaned_datasets_current_folder
+
+    # legacy data
+    if print_ds:
+        print(f"#" * 80)
+        print(f"The following legacy datasets are available for {country_name}")
+    legacy_data = {}
+    for item in data_folder_legacy.iterdir():
+        if item.is_dir():
+            cleaned_datasets_current_folder = {}
+            if print_ds:
+                print("-" * 80)
+                print(f"Data folder {item.name}")
+                print("-" * 80)
+            with open(item / "folder_mapping.json", "r") as mapping_file:
+                folder_mapping = json.load(mapping_file)
+            if country_code not in folder_mapping:
+                if print_ds:
+                    print("No data available")
+                    print("")
+            else:
+                country_folder = folder_mapping[country_code]
+                if not isinstance(country_folder, str):
+                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+
+                datasets_current_folder = {}
+                current_folder = item / country_folder
+
+                for data_file in current_folder.iterdir():
+                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                        if data_file.stem in datasets_current_folder:
+                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                        else:
+                            datasets_current_folder[data_file.stem] = [data_file.suffix]
+
+                for dataset in datasets_current_folder:
+                    # process filename to get submission
+                    parts = dataset.split('_')
+                    if parts[0] != country_code:
+                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
+                    else:
+                        terminology = "_".join(parts[3 : ])
+                        key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
+                        data_info = ""
+                        if '.nc' in datasets_current_folder[dataset]:
+                            data_info = data_info + "NF (.nc), "
+                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                            data_info = data_info + "IF (.yaml + .csv), "
+                        elif '.csv' in datasets_current_folder[dataset]:
+                            data_info = data_info + "incomplete IF? (.csv), "
+                        elif '.yaml' in datasets_current_folder[dataset]:
+                            data_info = data_info + "incomplete IF (.yaml), "
+
+                        cleaned_datasets_current_folder[key] = data_info
+
+                if print_ds:
+                    if cleaned_datasets_current_folder:
+                        for country_ds in cleaned_datasets_current_folder:
+                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                    else:
+                        print("No data available")
+                    print("")
+
+            legacy_data[item.name] = cleaned_datasets_current_folder
+
+    all_data = {
+        "rep_data": rep_data,
+        "legacy_data": legacy_data,
+    }
+
+    return all_data
+
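+# Example usage (a sketch; "Kenya" is an illustrative country name):
+#
+#     info = get_country_datasets("Kenya", print_ds=False)
+#     info["rep_data"]     # data folder -> {dataset: format and code info}
+#     info["legacy_data"]  # same structure for the legacy data folders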
+
+def get_code_file(
+        country_name: str,
+        submission: str,
+        print_info: bool = False,
+) -> Path:
+    """
+    For given country name and submission find the script that creates the data
+
+    Parameters
+    ----------
+        country_name: str
+            String containing the country name or ISO 3 letter code
+
+        submission: str
+            String of the submission
+
+        print_info: bool = False
+            If True print information on the code file found
+
+    Returns
+    -------
+        returns a pathlib Path object for the code file, or None if none is found
+    """
+
+    code_file_path = None
+    UNFCCC_reader_path = code_path / "UNFCCC_reader"
+
+    # CRF is an exception as it's read using the UNFCCC_CRF_reader module
+    # so we return the path to that.
+    if submission[0:3] == "CRF":
+        return root_path / "UNFCCC_CRF_reader"
+
+    if submission[0:2] == "DI":
+        return root_path / "UNFCCC_DI_reader"
+
+    # obtain country code
+    country_code = get_country_code(country_name)
+
+    if print_info:
+        print(f"Country name {country_name} maps to ISO code {country_code}")
+
+    with open(UNFCCC_reader_path / "folder_mapping.json", "r") as mapping_file:
+        folder_mapping = json.load(mapping_file)
+
+    if country_code not in folder_mapping:
+        if print_info:
+            print("No UNFCCC_GHG_data available")
+            print("")
+    else:
+        country_folder = UNFCCC_reader_path / folder_mapping[country_code]
+        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
+
+        for file in country_folder.iterdir():
+            if file.match(code_file_name_candidate):
+                if code_file_path is not None:
+                    raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
+                                     f"{code_file_path} and file.name. "
+                                     f"Please use only one file with name "
+                                     f"'read_ISO3_submission_XXX.YYY'.")
+                else:
+                    if print_info:
+                        print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
+                code_file_path = file
+
+    if code_file_path is not None:
+        return code_file_path.relative_to(root_path)
+    else:
+        return None
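+
+# Example usage (a sketch; the country and submission strings are illustrative):
+#
+#     script = get_code_file("Kenya", "BUR2", print_info=True)
+#     if script is not None:
+#         print(f"reading script: {script}")  # path relative to root_path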

+ 1 - 0
datasets/UNFCCC/DI_AnnexI/DI_AnnexI_2023-05-24_raw.csv

@@ -0,0 +1 @@
+DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.csv

+ 1 - 0
datasets/UNFCCC/DI_AnnexI/DI_AnnexI_2023-05-24_raw.nc

@@ -0,0 +1 @@
+DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.nc

+ 1 - 0
datasets/UNFCCC/DI_AnnexI/DI_AnnexI_2023-05-24_raw.yaml

@@ -0,0 +1 @@
+DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.yaml

+ 1 - 0
datasets/UNFCCC/DI_AnnexI/DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/21/Q3/MD5E-s8633920--e2717f06866dfa55b69dbbe89ec8016c.csv/MD5E-s8633920--e2717f06866dfa55b69dbbe89ec8016c.csv

+ 1 - 0
datasets/UNFCCC/DI_AnnexI/DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/Xk/7m/MD5E-s124012496--7381c2c8b4fc0f0ee7822227b82897b7.nc/MD5E-s124012496--7381c2c8b4fc0f0ee7822227b82897b7.nc

+ 29 - 0
datasets/UNFCCC/DI_AnnexI/DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.yaml

@@ -0,0 +1,29 @@
+attrs:
+  references: https://di.unfccc.int
+  title: 'Data submitted by the following AnnexI countries and available in the DI
+    interface on 2023-05-24: AUS, AUT, BEL, BGR, BLR'
+  comment: Data read from the UNFCCC DI flexible query interface using the API. Data
+    read on 2023-05-24.
+  rights: ''
+  contact: mail@johannes-guetschow.de
+  institution: United Nations Framework Convention on Climate Change (www.unfccc.int)
+  cat: category (CRFDI)
+  area: area (ISO3)
+  scen: scenario (Access_Date)
+  sec_cats:
+  - class
+  - measure
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - category (CRFDI)
+  - provenance
+  - measure
+  - scenario (Access_Date)
+  - source
+  - class
+  - area (ISO3)
+  - entity
+  - unit
+data_file: DI_AnnexI_8e37fff44242048f4872729a074fe7c8_raw_hash.csv
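+# A sketch of how such a yaml/csv pair can be loaded (assuming primap2 is
+# installed; the file stem of this dataset is used as the path):
+#
+#     from primap2 import pm2io
+#     data_if = pm2io.read_interchange_format("DI_AnnexI_2023-05-24_raw")
+#     data_xr = pm2io.from_interchange_format(data_if)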

+ 1 - 0
datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_2023-05-24_raw.csv

@@ -0,0 +1 @@
+DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.csv

+ 1 - 0
datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_2023-05-24_raw.nc

@@ -0,0 +1 @@
+DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.nc

+ 1 - 0
datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_2023-05-24_raw.yaml

@@ -0,0 +1 @@
+DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.yaml

+ 1 - 0
datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/kJ/1w/MD5E-s6365320--23bc20509afe3fee9f8dec1c64e1a1fe.csv/MD5E-s6365320--23bc20509afe3fee9f8dec1c64e1a1fe.csv

+ 1 - 0
datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/p0/9q/MD5E-s3871067--1e1932a6024f71dd6033f2464b63c439.nc/MD5E-s3871067--1e1932a6024f71dd6033f2464b63c439.nc

+ 40 - 0
datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.yaml

@@ -0,0 +1,40 @@
+attrs:
+  references: https://di.unfccc.int
+  comment: Data read from the UNFCCC DI flexible query interface using the API. Data
+    read on 2023-05-24.
+  rights: ''
+  contact: mail@johannes-guetschow.de
+  institution: United Nations Framework Convention on Climate Change (www.unfccc.int)
+  cat: category (BURDI)
+  area: area (ISO3)
+  scen: scenario (Access_Date)
+  sec_cats:
+  - class
+  - measure
+  title: 'Data submitted by the following non-AnnexI countries and available in the
+    DI interface on 2023-05-24: AFG, AGO, ALB, ARE, ARG, ARM, ATG, AZE, BDI, BEN,
+    BFA, BGD, BHR, BHS, BIH, BLZ, BOL, BRA, BRB, BRN, BTN, BWA, CAF, CHL, CHN, CIV,
+    CMR, COD, COG, COK, COL, COM, CPV, CRI, CUB, DJI, DMA, DOM, DZA, ECU, EGY, ERI,
+    ETH, FJI, FSM, GAB, GEO, GHA, GIN, GMB, GNB, GRD, GTM, GUY, HND, HTI, IDN, IND,
+    IRN, IRQ, ISR, JAM, JOR, KEN, KGZ, KHM, KIR, KNA, KOR, KWT, LAO, LBN, LBR, LCA,
+    LKA, LSO, MAR, MDA, MDG, MDV, MEX, MHL, MKD, MLI, MMR, MNE, MNG, MOZ, MRT, MUS,
+    MWI, MYS, NAM, NER, NGA, NIC, NIU, NPL, NRU, OMN, PAK, PAN, PER, PHL, PLW, PNG,
+    PRK, PRY, PSE, QAT, RWA, SAU, SDN, SEN, SGP, SLB, SLV, SMR, SRB, SSD, STP, SUR,
+    SWZ, SYC, SYR, TCD, TGO, THA, TJK, TKM, TLS, TON, TTO, TUN, TUV, TZA, UGA, URY,
+    UZB, VCT, VEN, VNM, VUT, WSM, YEM, ZAF, ZMB, ZWE'
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - area (ISO3)
+  - class
+  - provenance
+  - measure
+  - source
+  - scenario (Access_Date)
+  - category (BURDI)
+  - entity
+  - unit
+additional_coordinates:
+  orig_cat_name: category (BURDI)
+data_file: DI_non_AnnexI_9207438c486a8309418a9137dec85c07_raw_hash.csv

+ 1 - 0
datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_c10c3850dad21f4182cfdac530b6b49e_raw_hash.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/Pg/q5/MD5E-s170333--c9d15c83610e449344b11d7313d577e0.csv/MD5E-s170333--c9d15c83610e449344b11d7313d577e0.csv

+ 1 - 0
datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_c10c3850dad21f4182cfdac530b6b49e_raw_hash.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/x0/z5/MD5E-s250365--f718595d2232f23579164612e71192c7.nc/MD5E-s250365--f718595d2232f23579164612e71192c7.nc

+ 29 - 0
datasets/UNFCCC/DI_non_AnnexI/DI_non_AnnexI_c10c3850dad21f4182cfdac530b6b49e_raw_hash.yaml

@@ -0,0 +1,29 @@
+attrs:
+  references: https://di.unfccc.int
+  comment: Data read from the UNFCCC DI flexible query interface using the API. Data
+    read on 2023-05-24.
+  rights: ''
+  contact: mail@johannes-guetschow.de
+  institution: United Nations Framework Convention on Climate Change (www.unfccc.int)
+  cat: category (BURDI)
+  area: area (ISO3)
+  scen: scenario (Access_Date)
+  sec_cats:
+  - class
+  - measure
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - source
+  - measure
+  - class
+  - scenario (Access_Date)
+  - area (ISO3)
+  - provenance
+  - category (BURDI)
+  - entity
+  - unit
+additional_coordinates:
+  orig_cat_name: category (BURDI)
+data_file: DI_non_AnnexI_c10c3850dad21f4182cfdac530b6b49e_raw_hash.csv

+ 6 - 0
datasets/UNFCCC/no_updates_until.txt

@@ -0,0 +1,6 @@
+22/08/30: data is identical to 22/08/22
+22/09/26: data is identical to 22/08/22
+22/10/13: data is identical to 22/08/22
+22/10/29: data updates for BHS, LKA, MDA (just 0 instead of nan), MYS, PAK (new data), PRY (0 instead of nan), TCD (new data)
+23/01/23: data is identical to 22/10/29
+

+ 94 - 9
dodo.py

@@ -1,5 +1,6 @@
 # define tasks for UNFCCC data repository
 from doit import get_var
+import os

 # TODO: task for folder mapping

@@ -18,6 +19,18 @@ def task_setup_venv():
         'verbosity': 2,
     }

+# set UNFCCC_GHG_ROOT_PATH environment variable
+def task_set_env():
+    """
+    Set the environment variable for the module so data is stored in the correct folders
+    """
+    def set_root_path():
+        os.environ["UNFCCC_GHG_ROOT_PATH"] = "."
+
+    return {
+        'actions': [set_root_path],
+    }
+

 # Task to create the mapping files which map folder names to ISO 3-letter country codes
 read_config_folder = {
@@ -29,8 +42,9 @@ def task_map_folders():
     Create or update the folder mapping in the given folder
     """
     return {
-        'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        'actions': [f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder={read_config_folder['folder']}"],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -44,6 +58,7 @@ def task_update_bur():
         'actions': ['datalad run -m "Fetch BUR submissions" '
                     '-o downloaded_data/UNFCCC/submissions-bur.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py'],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -58,9 +73,10 @@ def task_download_bur():
         'actions': ['datalad run -m "Download BUR submissions" '
                     '-i downloaded_data/UNFCCC/submissions-bur.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=BUR',
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -73,6 +89,7 @@ def task_update_nc():
         'actions': ['datalad run -m "Fetch NC submissions" '
                     '-o downloaded_data/UNFCCC/submissions-nc.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py'],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -87,9 +104,10 @@ def task_download_nc():
         'actions': ['datalad run -m "Download NC submissions" '
                     '-i downloaded_data/UNFCCC/submissions-nc.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=NC',
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -112,6 +130,7 @@ def task_update_annexi():
                     f"-o downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py "
                     f"--year={update_aI_config['year']}"],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -128,9 +147,10 @@ def task_download_annexi():
                     f"-i downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py "
                     f"--category={update_aI_config['category']} --year={update_aI_config['year']}",
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -141,9 +161,10 @@ def task_download_ndc():
     return {
         'actions': ['datalad run -m "Download NDC submissions" '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py',
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -163,9 +184,10 @@ def task_read_unfccc_submission():
     return {
         'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py "
                     f"--country={read_config['country']} --submission={read_config['submission']}",
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=extracted_data/UNFCCC"
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -188,13 +210,14 @@ def task_read_unfccc_crf_submission():
         f"--country={read_config_crf['country']} "
         f"--submission_year={read_config_crf['submission_year']} "
         f"--submission_date={read_config_crf['submission_date']} ",
-        f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
         f"--folder=extracted_data/UNFCCC"
         ]
     if read_config_crf["re_read"] == "True":
         actions[0] = actions[0] + " --re_read"
     return {
         'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
@@ -205,7 +228,7 @@ def task_read_new_unfccc_crf_for_year():
     data not present yet. Only reads the latest updated submission for each country."""
     actions = [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_CRF_reader/read_new_UNFCCC_CRF_for_year_datalad.py "
                f"--submission_year={read_config_crf['submission_year']} ",
-               f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+               f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                f"--folder=extracted_data/UNFCCC"
                ]
     # specifying countries is currently disabled due to problems with command line
@@ -217,17 +240,79 @@ def task_read_new_unfccc_crf_for_year():
     return {
         #'basename': "Read_CRF_year",
         'actions': actions,
+        'task_dep': ['set_env'],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
+# tasks for DI reader
+# datalad run is called from within the read_UNFCCC_DI_for_country.py script
+read_config_di = {
+    "country": get_var('country', None),
+    "date": get_var('date', None),
+    "annexI": get_var('annexI', False),
+    #"countries": get_var('countries', None),
+}
+
+def task_read_unfccc_di_for_country():
+    """ Read DI data for a country """
+    actions = [
+        f"./venv/bin/python "
+        f"UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_datalad.py "
+        f"--country={read_config_di['country']}",
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
+        f"--folder=extracted_data/UNFCCC"
+        ]
+    return {
+        'actions': actions,
+        'task_dep': ['set_env'],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
+def task_process_unfccc_di_for_country():
+    """ Process DI data for a country """
+    actions = [
+        f"./venv/bin/python "
+        f"UNFCCC_GHG_data/UNFCCC_DI_reader/process_UNFCCC_DI_for_country_datalad.py "
+        f"--country={read_config_di['country']} --date={read_config_di['date']}",
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
+        f"--folder=extracted_data/UNFCCC"
+        ]
+    return {
+        'actions': actions,
+        'task_dep': ['set_env'],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
+def task_read_unfccc_di_for_country_group():
+    """ Read DI data for a group of countries """
+    actions = [
+        f"./venv/bin/python "
+        f"UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_group_datalad.py",
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
+        f"--folder=extracted_data/UNFCCC"
+        ]
+    if read_config_di["annexI"] == "True":
+        actions[0] = actions[0] + " --annexI"
+
+    return {
+        'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }


+# general tasks
 def task_country_info():
     """ Print information on submissions and datasets
     available for given country"""
     return {
-        'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/country_info.py "
+        'actions': [f"./venv/bin/python UNFCCC_GHG_data/helper/country_info.py "
                     f"--country={read_config['country']}"],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'setup': ['setup_venv'],
     }
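+
+# Example invocations (a sketch; doit derives task names from the function
+# names above minus the "task_" prefix, and "KEN"/the date are illustrative):
+#
+#     doit read_unfccc_di_for_country country=KEN
+#     doit process_unfccc_di_for_country country=KEN date=2023-05-24
+#     doit read_unfccc_di_for_country_group annexI=True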

+ 1 - 0
downloaded_data/UNFCCC/00_new_downloads_BUR-2023-05-25.csv

@@ -0,0 +1 @@
+../../.git/annex/objects/xK/JK/MD5E-s902--fdd2551e2c7624333bed28a18c1bb2be.csv/MD5E-s902--fdd2551e2c7624333bed28a18c1bb2be.csv

+ 1 - 0
downloaded_data/UNFCCC/00_new_downloads_CRF2023-2023-05-25.csv

@@ -0,0 +1 @@
+../../.git/annex/objects/19/v1/MD5E-s301--5c7bf2049ef674fc24e8b6cd815d3ec5.csv/MD5E-s301--5c7bf2049ef674fc24e8b6cd815d3ec5.csv

+ 1 - 0
downloaded_data/UNFCCC/00_new_downloads_NC-2023-05-26.csv

@@ -0,0 +1 @@
+../../.git/annex/objects/v2/00/MD5E-s729--a132ceae9eb74eef0dfa35ff35e934f7.csv/MD5E-s729--a132ceae9eb74eef0dfa35ff35e934f7.csv

+ 1 - 0
downloaded_data/UNFCCC/Bosnia_and_Herzegovina/BUR3/TBUR_BiH_Oct__2022_ENG.pdf

@@ -0,0 +1 @@
+../../../../.git/annex/objects/vm/Pz/MD5E-s2296920--54671aebed597ce15ba33412130564a9.pdf/MD5E-s2296920--54671aebed597ce15ba33412130564a9.pdf

+ 1 - 0
downloaded_data/UNFCCC/Bosnia_and_Herzegovina/NC4/FNC_BiH_ENG_fin.pdf

@@ -0,0 +1 @@
+../../../../.git/annex/objects/m7/V6/MD5E-s8376835--b8cc7123b927e5cfce50b3997039ac21.pdf/MD5E-s8376835--b8cc7123b927e5cfce50b3997039ac21.pdf

+ 1 - 0
downloaded_data/UNFCCC/Micronesia_(Federated_States_of)/BUR1/NC3_BUR1_MICRONESIA_UNFCCC.pdf

@@ -0,0 +1 @@
+../../../../.git/annex/objects/V2/z6/MD5E-s13644421--5a8de76f99ee7a2bdabad7c064d4d986.pdf/MD5E-s13644421--5a8de76f99ee7a2bdabad7c064d4d986.pdf

+ 1 - 0
downloaded_data/UNFCCC/Nicaragua/NC4/4CN-Nicaragua.pdf

@@ -0,0 +1 @@
+../../../../.git/annex/objects/g1/9g/MD5E-s22028489--551918780966395e2e73c30eaf7954f2.pdf/MD5E-s22028489--551918780966395e2e73c30eaf7954f2.pdf

+ 1 - 0
downloaded_data/UNFCCC/Niger/BUR1/92876103_Niger-BUR1-1-PREMIER_RAPPORT_BIENNAL_ACTUALISE_DU_NIGER.pdf

@@ -0,0 +1 @@
+../../../../.git/annex/objects/pm/VQ/MD5E-s5077565--a2fb630d0a395c7b722eb561cfff230b.pdf/MD5E-s5077565--a2fb630d0a395c7b722eb561cfff230b.pdf

+ 1 - 0
downloaded_data/UNFCCC/Niger/BUR1/RIN_BUR-2022_VF_11-07-2022_FINAL.pdf

@@ -0,0 +1 @@
+../../../../.git/annex/objects/6f/0J/MD5E-s4137398--9e680b56495af7b48ed59f8cb8d4655a.pdf/MD5E-s4137398--9e680b56495af7b48ed59f8cb8d4655a.pdf

File diff suppressed because it is too large
+ 1413 - 0
downloaded_data/UNFCCC/North_Macedonia/NC4/EN%2C_IV_NCCC.pdf


+ 1 - 0
downloaded_data/UNFCCC/North_Macedonia/NIR/IV_Inventory_report.pdf

@@ -0,0 +1 @@
+../../../../.git/annex/objects/1Z/qM/MD5E-s10485798--57047f5fcb4ac3892a5acefbdf20fded.pdf/MD5E-s10485798--57047f5fcb4ac3892a5acefbdf20fded.pdf

+ 1 - 0
downloaded_data/UNFCCC/Somalia/BUR1/Somalia_First_BUR_report_2022.pdf

@@ -0,0 +1 @@
+../../../../.git/annex/objects/fj/j8/MD5E-s4034845--018ffe4c39a6fd2a6f7b5de7551f42d0.pdf/MD5E-s4034845--018ffe4c39a6fd2a6f7b5de7551f42d0.pdf

+ 1 - 0
downloaded_data/UNFCCC/Suriname/NC3/SURINAME_NC3_2023_FINAL.pdf

@@ -0,0 +1 @@
+../../../../.git/annex/objects/FF/mm/MD5E-s10184040--43ba0348a79943ea2f4427f6b6e81812.pdf/MD5E-s10184040--43ba0348a79943ea2f4427f6b6e81812.pdf

+ 1 - 1
downloaded_data/UNFCCC/submissions-bur.csv

@@ -1 +1 @@
-../../.git/annex/objects/9P/37/MD5E-s47375--6ca4662bb7e57d404617b66ed3028050.csv/MD5E-s47375--6ca4662bb7e57d404617b66ed3028050.csv
+../../.git/annex/objects/Kp/jz/MD5E-s47947--27f4ac07d714bdca02d0187343ddb51b.csv/MD5E-s47947--27f4ac07d714bdca02d0187343ddb51b.csv

+ 1 - 1
downloaded_data/UNFCCC/submissions-nc.csv

@@ -1 +1 @@
-../../.git/annex/objects/FX/QJ/MD5E-s79887--1e9d67b4a0ea171590ffe9c8ac09465d.csv/MD5E-s79887--1e9d67b4a0ea171590ffe9c8ac09465d.csv
+../../.git/annex/objects/xj/z0/MD5E-s80513--bc265a60b581084f38e4e5a2284d026a.csv/MD5E-s80513--bc265a60b581084f38e4e5a2284d026a.csv

+ 1 - 0
extracted_data/UNFCCC/Afghanistan/AFG_DI_2023-05-24_raw.csv

@@ -0,0 +1 @@
+AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.csv

+ 1 - 0
extracted_data/UNFCCC/Afghanistan/AFG_DI_2023-05-24_raw.nc

@@ -0,0 +1 @@
+AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.nc

+ 1 - 0
extracted_data/UNFCCC/Afghanistan/AFG_DI_2023-05-24_raw.yaml

@@ -0,0 +1 @@
+AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.yaml

+ 1 - 0
extracted_data/UNFCCC/Afghanistan/AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/Gg/2X/MD5E-s18919--a42a1fee0e4c3c610714557a75d1620e.csv/MD5E-s18919--a42a1fee0e4c3c610714557a75d1620e.csv

+ 1 - 0
extracted_data/UNFCCC/Afghanistan/AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/pQ/86/MD5E-s114204--457a97cd84140839af57080e201d1551.nc/MD5E-s114204--457a97cd84140839af57080e201d1551.nc

+ 31 - 0
extracted_data/UNFCCC/Afghanistan/AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.yaml

@@ -0,0 +1,31 @@
+attrs:
+  references: https://di.unfccc.int
+  title: Data submitted to the UNFCCC by country Afghanistan as available in the DI
+    interface on 2023-05-24.
+  comment: Data read from the UNFCCC DI flexible query interface using the API. Data
+    read on 2023-05-24.
+  rights: ''
+  contact: mail@johannes-guetschow.de
+  institution: United Nations Framework Convention on Climate Change (www.unfccc.int)
+  cat: category (BURDI)
+  area: area (ISO3)
+  scen: scenario (Access_Date)
+  sec_cats:
+  - class
+  - measure
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - source
+  - measure
+  - class
+  - scenario (Access_Date)
+  - area (ISO3)
+  - provenance
+  - category (BURDI)
+  - entity
+  - unit
+additional_coordinates:
+  orig_cat_name: category (BURDI)
+data_file: AFG_DI_4f8be4fe6093240f111a4861566443fb_raw_hash.csv

+ 1 - 0
extracted_data/UNFCCC/Albania/ALB_DI_2023-05-24_raw.csv

@@ -0,0 +1 @@
+ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.csv

+ 1 - 0
extracted_data/UNFCCC/Albania/ALB_DI_2023-05-24_raw.nc

@@ -0,0 +1 @@
+ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.nc

+ 1 - 0
extracted_data/UNFCCC/Albania/ALB_DI_2023-05-24_raw.yaml

@@ -0,0 +1 @@
+ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.yaml

+ 1 - 0
extracted_data/UNFCCC/Albania/ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/Pq/v4/MD5E-s85461--155b0c24d8922433f5b988c6854f9b35.csv/MD5E-s85461--155b0c24d8922433f5b988c6854f9b35.csv

+ 1 - 0
extracted_data/UNFCCC/Albania/ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/Gk/FK/MD5E-s245028--6d75bea846c53fc0d8fa41c553bb4ca7.nc/MD5E-s245028--6d75bea846c53fc0d8fa41c553bb4ca7.nc

+ 31 - 0
extracted_data/UNFCCC/Albania/ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.yaml

@@ -0,0 +1,31 @@
+attrs:
+  references: https://di.unfccc.int
+  title: Data submitted to the UNFCCC by country Albania as available in the DI interface
+    on 2023-05-24.
+  comment: Data read from the UNFCCC DI flexible query interface using the API. Data
+    read on 2023-05-24.
+  rights: ''
+  contact: mail@johannes-guetschow.de
+  institution: United Nations Framework Convention on Climate Change (www.unfccc.int)
+  cat: category (BURDI)
+  area: area (ISO3)
+  scen: scenario (Access_Date)
+  sec_cats:
+  - class
+  - measure
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - source
+  - measure
+  - class
+  - scenario (Access_Date)
+  - area (ISO3)
+  - provenance
+  - category (BURDI)
+  - entity
+  - unit
+additional_coordinates:
+  orig_cat_name: category (BURDI)
+data_file: ALB_DI_81886afef7c571b60699a44198be0042_raw_hash.csv

+ 1 - 0
extracted_data/UNFCCC/Algeria/DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/j6/v7/MD5E-s33876--617014378d61f3ba6fc2e5ce0d1b95d8.csv/MD5E-s33876--617014378d61f3ba6fc2e5ce0d1b95d8.csv

+ 1 - 0
extracted_data/UNFCCC/Algeria/DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/g2/Vm/MD5E-s142168--86942819046fe2819092c4679f870d37.nc/MD5E-s142168--86942819046fe2819092c4679f870d37.nc

+ 31 - 0
extracted_data/UNFCCC/Algeria/DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.yaml

@@ -0,0 +1,31 @@
+attrs:
+  references: https://di.unfccc.int
+  title: Data submitted to the UNFCCC by country Algeria as available in the DI interface
+    on 2023-05-24.
+  comment: Data read from the UNFCCC DI flexible query interface using the API. Data
+    read on 2023-05-24.
+  rights: ''
+  contact: mail@johannes-guetschow.de
+  institution: United Nations Framework Convention on Climate Change (www.unfccc.int)
+  cat: category (BURDI)
+  area: area (ISO3)
+  scen: scenario (Access_Date)
+  sec_cats:
+  - class
+  - measure
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - source
+  - measure
+  - class
+  - scenario (Access_Date)
+  - area (ISO3)
+  - provenance
+  - category (BURDI)
+  - entity
+  - unit
+additional_coordinates:
+  orig_cat_name: category (BURDI)
+data_file: DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.csv

+ 1 - 0
extracted_data/UNFCCC/Algeria/DZA_DI_2023-05-24_raw.csv

@@ -0,0 +1 @@
+DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.csv

+ 1 - 0
extracted_data/UNFCCC/Algeria/DZA_DI_2023-05-24_raw.nc

@@ -0,0 +1 @@
+DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.nc

+ 1 - 0
extracted_data/UNFCCC/Algeria/DZA_DI_2023-05-24_raw.yaml

@@ -0,0 +1 @@
+DZA_DI_1379ca063b21fcfd4914106a4a4b3f3e_raw_hash.yaml

+ 1 - 0
extracted_data/UNFCCC/Angola/AGO_DI_2023-05-24_raw.csv

@@ -0,0 +1 @@
+AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.csv

+ 1 - 0
extracted_data/UNFCCC/Angola/AGO_DI_2023-05-24_raw.nc

@@ -0,0 +1 @@
+AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.nc

+ 1 - 0
extracted_data/UNFCCC/Angola/AGO_DI_2023-05-24_raw.yaml

@@ -0,0 +1 @@
+AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.yaml

+ 1 - 0
extracted_data/UNFCCC/Angola/AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.csv

@@ -0,0 +1 @@
+../../../.git/annex/objects/g7/pX/MD5E-s10020--fe0830fefd7d17c22e2bdcd904eac64c.csv/MD5E-s10020--fe0830fefd7d17c22e2bdcd904eac64c.csv

+ 1 - 0
extracted_data/UNFCCC/Angola/AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.nc

@@ -0,0 +1 @@
+../../../.git/annex/objects/km/k4/MD5E-s102299--8f7f0b75284302c6ac533ded03fec0b7.nc/MD5E-s102299--8f7f0b75284302c6ac533ded03fec0b7.nc

+ 31 - 0
extracted_data/UNFCCC/Angola/AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.yaml

@@ -0,0 +1,31 @@
+attrs:
+  references: https://di.unfccc.int
+  title: Data submitted to the UNFCCC by country Angola as available in the DI interface
+    on 2023-05-24.
+  comment: Data read from the UNFCCC DI flexible query interface using the API. Data
+    read on 2023-05-24.
+  rights: ''
+  contact: mail@johannes-guetschow.de
+  institution: United Nations Framework Convention on Climate Change (www.unfccc.int)
+  cat: category (BURDI)
+  area: area (ISO3)
+  scen: scenario (Access_Date)
+  sec_cats:
+  - class
+  - measure
+time_format: '%Y'
+dimensions:
+  '*':
+  - time
+  - source
+  - measure
+  - class
+  - scenario (Access_Date)
+  - area (ISO3)
+  - provenance
+  - category (BURDI)
+  - entity
+  - unit
+additional_coordinates:
+  orig_cat_name: category (BURDI)
+data_file: AGO_DI_88f688f9a6b8a1c4cf2d8132d945e59b_raw_hash.csv

Some files were not shown because too many files changed in this diff