Parcourir la source

restructure repository; add function to read all DI data

Johannes Gütschow il y a 1 an
Parent
commit
532334647f
34 fichiers modifiés avec 839 ajouts et 747 suppressions
  1. 2 12
      UNFCCC_GHG_data/UNFCCC_CRF_reader/CRF_raw_for_year.py
  2. 6 24
      UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_core.py
  3. 1 3
      UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_devel.py
  4. 7 11
      UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py
  5. 0 20
      UNFCCC_GHG_data/UNFCCC_CRF_reader/util.py
  6. 202 21
      UNFCCC_GHG_data/UNFCCC_DI_reader/UNFCCC_DI_reader_core.py
  7. 4 1
      UNFCCC_GHG_data/UNFCCC_DI_reader/__init__.py
  8. 0 75
      UNFCCC_GHG_data/UNFCCC_DI_reader/util.py
  9. 12 11
      UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py
  10. 7 9
      UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py
  11. 8 9
      UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py
  12. 2 3
      UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py
  13. 2 3
      UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py
  14. 3 4
      UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py
  15. 1 6
      UNFCCC_GHG_data/UNFCCC_reader/Argentina/read_ARG_BUR4_from_pdf.py
  16. 1 6
      UNFCCC_GHG_data/UNFCCC_reader/Chile/read_CHL_BUR4_from_xlsx.py
  17. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Colombia/read_COL_BUR3_from_xlsx.py
  18. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Indonesia/read_IDN_BUR3_from_pdf.py
  19. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Mexico/read_MEX_BUR3_from_pdf.py
  20. 2 8
      UNFCCC_GHG_data/UNFCCC_reader/Morocco/read_MAR_BUR3_from_pdf.py
  21. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_2021-Inventory_from_xlsx.py
  22. 1 6
      UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py
  23. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Taiwan/read_TWN_2022-Inventory_from_pdf.py
  24. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Thailand/read_THA_BUR3_from_pdf.py
  25. 1 6
      UNFCCC_GHG_data/UNFCCC_reader/__init__.py
  26. 3 437
      UNFCCC_GHG_data/UNFCCC_reader/get_submissions_info.py
  27. 3 9
      UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py
  28. 6 1
      UNFCCC_GHG_data/__init__.py
  29. 24 0
      UNFCCC_GHG_data/helper/__init__.py
  30. 2 2
      UNFCCC_GHG_data/helper/country_info.py
  31. 49 0
      UNFCCC_GHG_data/helper/definitions.py
  32. 1 1
      UNFCCC_GHG_data/helper/folder_mapping.py
  33. 445 0
      UNFCCC_GHG_data/helper/functions.py
  34. 38 11
      dodo.py

+ 2 - 12
UNFCCC_GHG_data/UNFCCC_CRF_reader/CRF_raw_for_year.py

@@ -9,20 +9,10 @@ submission are available in the downloaded data folder.
 # TODO: integrate into doit
 # TODO: integrate into doit
 
 
 import argparse
 import argparse
-import sys
 import primap2 as pm2
 import primap2 as pm2
 from pathlib import Path
 from pathlib import Path
 from datetime import date
 from datetime import date
-
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-#log_path = root_path / "log"
-code_path = root_path / "UNFCCC_GHG_data"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / "extracted_data" / "UNFCCC"
-dataset_path = root_path / "datasets" / "UNFCCC"
-
-#sys.path.append(code_path.name)
+from UNFCCC_GHG_data.helper import dataset_path_UNFCCC
 
 
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.util import all_crf_countries
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.util import all_crf_countries
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.UNFCCC_CRF_reader_prod import get_input_and_output_files_for_country
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.UNFCCC_CRF_reader_prod import get_input_and_output_files_for_country
@@ -81,7 +71,7 @@ for country in all_crf_countries:
 today = date.today()
 today = date.today()
 
 
 compression = dict(zlib=True, complevel=9)
 compression = dict(zlib=True, complevel=9)
-output_folder = dataset_path / f"CRF{submission_year}"
+output_folder = dataset_path_UNFCCC / f"CRF{submission_year}"
 output_filename = f"CRF{submission_year}_raw_{today.strftime('%Y-%m-%d')}"
 output_filename = f"CRF{submission_year}_raw_{today.strftime('%Y-%m-%d')}"
 
 
 if not output_folder.exists():
 if not output_folder.exists():

+ 6 - 24
UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_core.py

@@ -8,9 +8,7 @@ import re
 import json
 import json
 import numpy as np
 import numpy as np
 import pandas as pd
 import pandas as pd
-import xarray as xr
 import primap2 as pm2
 import primap2 as pm2
-import pycountry
 from pathlib import Path
 from pathlib import Path
 from treelib import Tree
 from treelib import Tree
 from operator import itemgetter
 from operator import itemgetter
@@ -18,8 +16,8 @@ from collections import Counter
 from typing import Dict, List, Optional, Tuple, Union
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime, timedelta
 from datetime import datetime, timedelta
 from . import crf_specifications as crf
 from . import crf_specifications as crf
-from .util import downloaded_data_path, NoCRFFilesError, custom_country_mapping
-
+from .util import NoCRFFilesError
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 
 
 ### reading functions
 ### reading functions
 def convert_crf_table_to_pm2if(
 def convert_crf_table_to_pm2if(
@@ -568,7 +566,7 @@ def get_crf_files(
     # we should only have files for one country and submission in the folder. But the
     # we should only have files for one country and submission in the folder. But the
     # function can also be used on a given folder and then the filter is useful.
     # function can also be used on a given folder and then the filter is useful.
     if folder is None:
     if folder is None:
-        data_folder = downloaded_data_path
+        data_folder = downloaded_data_path_UNFCCC
         submission_folder = f"CRF{submission_year}"
         submission_folder = f"CRF{submission_year}"
 
 
         with open(data_folder / "folder_mapping.json", "r") as mapping_file:
         with open(data_folder / "folder_mapping.json", "r") as mapping_file:
@@ -935,7 +933,7 @@ def get_latest_date_for_country(
         str: string with date
         str: string with date
     """
     """
 
 
-    with open(downloaded_data_path / "folder_mapping.json", "r") as mapping_file:
+    with open(downloaded_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
         folder_mapping = json.load(mapping_file)
         folder_mapping = json.load(mapping_file)
 
 
     if country_code in folder_mapping:
     if country_code in folder_mapping:
@@ -946,12 +944,12 @@ def get_latest_date_for_country(
         if isinstance(country_folders, str):
         if isinstance(country_folders, str):
             # only one folder
             # only one folder
             submission_date = find_latest_date(get_submission_dates(
             submission_date = find_latest_date(get_submission_dates(
-                downloaded_data_path / country_folders / f"CRF{submission_year}", file_filter))
+                downloaded_data_path_UNFCCC / country_folders / f"CRF{submission_year}", file_filter))
         else:
         else:
             dates = []
             dates = []
             for folder in country_folders:
             for folder in country_folders:
                 dates = dates + get_submission_dates(
                 dates = dates + get_submission_dates(
-                    downloaded_data_path / folder / f"CRF{submission_year}", file_filter)
+                    downloaded_data_path_UNFCCC / folder / f"CRF{submission_year}", file_filter)
             submission_date = find_latest_date(dates)
             submission_date = find_latest_date(dates)
     else:
     else:
         raise ValueError(f"No data folder found for country {country_code}. "
         raise ValueError(f"No data folder found for country {country_code}. "
@@ -1059,19 +1057,3 @@ def find_latest_date(
 
 
     return dates_datetime[-1][0]
     return dates_datetime[-1][0]
 
 
-
-def get_country_name(
-        country_code: str,
-) -> str:
-    """get country name from UNFCCC_GHG_data """
-    if country_code in custom_country_mapping:
-        country_name = custom_country_mapping[country_code]
-    else:
-        try:
-            country = pycountry.countries.get(alpha_3=country_code)
-            country_name = country.name
-        except:
-            raise ValueError(f"Country UNFCCC_GHG_data {country_code} can not be mapped to "
-                             f"any country")
-
-    return country_name

+ 1 - 3
UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_devel.py

@@ -13,11 +13,9 @@ from typing import List, Optional
 from pathlib import Path
 from pathlib import Path
 from datetime import date
 from datetime import date
 
 
-
 from .util import all_crf_countries
 from .util import all_crf_countries
-from .util import log_path
+from UNFCCC_GHG_data.helper import log_path, get_country_name
 from . import crf_specifications as crf
 from . import crf_specifications as crf
-from .UNFCCC_CRF_reader_core import get_country_name
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country, read_crf_table
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country, read_crf_table
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 
 

+ 7 - 11
UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py

@@ -20,18 +20,16 @@ from .UNFCCC_CRF_reader_core import read_crf_table
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country
 from .UNFCCC_CRF_reader_core import get_crf_files
 from .UNFCCC_CRF_reader_core import get_crf_files
-from .UNFCCC_CRF_reader_core import get_country_name
 from .UNFCCC_CRF_reader_devel import save_unknown_categories_info
 from .UNFCCC_CRF_reader_devel import save_unknown_categories_info
 from .UNFCCC_CRF_reader_devel import save_last_row_info
 from .UNFCCC_CRF_reader_devel import save_last_row_info
 
 
-from .util import code_path, log_path, \
-    custom_country_mapping, extracted_data_path, root_path, \
-    all_crf_countries, NoCRFFilesError
+from UNFCCC_GHG_data.helper import code_path, log_path, root_path
+from UNFCCC_GHG_data.helper import custom_country_mapping, extracted_data_path_UNFCCC
+from UNFCCC_GHG_data.helper import get_country_code, get_country_name
+from .util import all_crf_countries, NoCRFFilesError
 
 
 #import sys
 #import sys
 #sys.path.append(code_path.name)
 #sys.path.append(code_path.name)
-from ..UNFCCC_reader import get_country_code
-
 
 
 # functions:
 # functions:
 # * testing functions
 # * testing functions
@@ -42,8 +40,6 @@ from ..UNFCCC_reader import get_country_code
 
 
 # TODO: add function to read several / all countries
 # TODO: add function to read several / all countries
 
 
-
-
 # general approach:
 # general approach:
 # main UNFCCC_GHG_data in a function that reads one table from one file.
 # main UNFCCC_GHG_data in a function that reads one table from one file.
 # return raw pandas DF for use in different functions
 # return raw pandas DF for use in different functions
@@ -188,7 +184,7 @@ def read_crf_for_country(
 
 
         if save_data:
         if save_data:
             compression = dict(zlib=True, complevel=9)
             compression = dict(zlib=True, complevel=9)
-            output_folder = extracted_data_path / country_name.replace(" ", "_")
+            output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
             output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
             output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
 
 
             if not output_folder.exists():
             if not output_folder.exists():
@@ -476,7 +472,7 @@ def get_input_and_output_files_for_country(
     country_info["input"] = input_files
     country_info["input"] = input_files
 
 
     # get output file
     # get output file
-    output_folder = extracted_data_path / country_name.replace(" ", "_")
+    output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
     output_files = [output_folder / f"{country_code}_CRF{submission_year}"
     output_files = [output_folder / f"{country_code}_CRF{submission_year}"
                                     f"_{submission_date}.{suffix}" for suffix
                                     f"_{submission_date}.{suffix}" for suffix
                     in ['yaml', 'csv', 'nc']]
                     in ['yaml', 'csv', 'nc']]
@@ -505,7 +501,7 @@ def submission_has_been_read(
     """
     """
     Check if a CRF submission has already been read
     Check if a CRF submission has already been read
     """
     """
-    output_folder = extracted_data_path / country_name.replace(" ", "_")
+    output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
     output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
     output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
     if output_folder.exists():
     if output_folder.exists():
         existing_files = output_folder.glob(f"{output_filename}.*")
         existing_files = output_folder.glob(f"{output_filename}.*")

+ 0 - 20
UNFCCC_GHG_data/UNFCCC_CRF_reader/util.py

@@ -1,23 +1,3 @@
-from pathlib import Path
-
-# 4 for use from nbs, fix
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-log_path = root_path / "log"
-code_path = root_path / "UNFCCC_GHG_data"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / "extracted_data" / "UNFCCC"
-
-# TODO: move this to a more general location as we can't import it
-# to get_submissions_info
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom of Great Britain and Northern Ireland",
-}
 
 
 all_crf_countries = [
 all_crf_countries = [
     'AUS', 'AUT', 'BEL', 'BGR', 'BLR',
     'AUS', 'AUT', 'BEL', 'BGR', 'BLR',

+ 202 - 21
UNFCCC_GHG_data/UNFCCC_DI_reader/UNFCCC_DI_reader_core.py

@@ -24,11 +24,14 @@ from .UNFCCC_DI_reader_config import di_query_filters
 from .UNFCCC_DI_reader_config import di_processing_info
 from .UNFCCC_DI_reader_config import di_processing_info
 from .UNFCCC_DI_reader_config import cat_conversion
 from .UNFCCC_DI_reader_config import cat_conversion
 from .UNFCCC_DI_reader_config import gas_baskets
 from .UNFCCC_DI_reader_config import gas_baskets
-from .util import NoDIDataError, get_country_name, get_country_code
-from .util import nAI_countries, AI_countries, custom_country_mapping
-from .util import code_path, root_path, extracted_data_path
+from .util import NoDIDataError, nAI_countries, AI_countries
 from .util import DI_date_format, regex_date
 from .util import DI_date_format, regex_date
 
 
+from UNFCCC_GHG_data.helper import custom_country_mapping
+from UNFCCC_GHG_data.helper import get_country_code, get_country_name
+from UNFCCC_GHG_data.helper import extracted_data_path_UNFCCC, root_path, code_path
+from UNFCCC_GHG_data.helper import dataset_path_UNFCCC
+
 
 
 def read_UNFCCC_DI_for_country(
 def read_UNFCCC_DI_for_country(
         country_code: str,
         country_code: str,
@@ -612,7 +615,8 @@ def convert_DI_data_to_pm2_if(
     if date_str == "country":
     if date_str == "country":
         pm2if_specifications["coords_defaults"]["scenario"] = f"DIrolling"
         pm2if_specifications["coords_defaults"]["scenario"] = f"DIrolling"
     elif date_str is None:
     elif date_str is None:
-        date_str = str(date.today())
+        today = date.today()
+        date_str = today.strftime(DI_date_format)
     pm2if_specifications["coords_defaults"]["scenario"] = f"DI{date_str}"
     pm2if_specifications["coords_defaults"]["scenario"] = f"DI{date_str}"
 
 
     # set metadata
     # set metadata
@@ -737,7 +741,7 @@ def save_DI_country_data(
 
 
     # get the filename with the hash and check if it exists (separate for pm2 format
     # get the filename with the hash and check if it exists (separate for pm2 format
     # and IF to fix broken datasets if necessary)
     # and IF to fix broken datasets if necessary)
-    filename_hash = determine_filename(country_code, token, raw, hash=True)
+    filename_hash = root_path / determine_filename(country_code, token, raw, hash=True)
 
 
     # primap2 native format
     # primap2 native format
     filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
     filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
@@ -761,7 +765,79 @@ def save_DI_country_data(
         print(f"Data unchanged for {country_code}. Create symlinks.")
         print(f"Data unchanged for {country_code}. Create symlinks.")
 
 
     # get the filename with the date
     # get the filename with the date
-    filename_date = determine_filename(country_code, date_str, raw)
+    filename_date = root_path / determine_filename(country_code, date_str, raw)
+
+    # create the symlinks to the actual data (with the hash)
+    suffixes = ['.nc', '.csv', '.yaml']
+    for suffix in suffixes:
+        file_date = filename_date.parent / (filename_date.name + suffix)
+        file_hash = filename_hash.name + suffix
+        if file_date.exists():
+            file_date.unlink()
+        file_date.symlink_to(file_hash)
+
+
+def save_DI_dataset(
+        data_pm2: xr.Dataset,
+        raw: bool=True,
+        non_AnnexI: bool=True,
+):
+    '''
+    save primap2 and IF data to dataset folder
+    can be used for raw and processed data but not to save to country folders
+    '''
+
+    # preparations
+    data_if = data_pm2.pr.to_interchange_format()
+    if non_AnnexI:
+        country_group = "non-AnnexI"
+    else:
+        country_group = "AnnexI"
+
+    ## get timestamp
+    scenario_col = data_pm2.attrs['scen']
+    scenarios = data_if[scenario_col].unique()
+    if len(scenarios) > 1:
+        raise ValueError(f"More than one scenario in input data. This function can only"
+                         f"handle single scenario data. Scenarios: {scenarios}")
+    else:
+        scenario = scenarios[0]
+
+    date_str = scenario[2:]
+
+    # calculate the hash of the data to see if it's identical to present data
+    data_for_token = data_if.drop(columns=[scenario_col])
+    token = tokenize(data_for_token)
+
+    # get the filename with the hash and check if it exists (separate for pm2 format
+    # and IF to fix broken datasets if necessary)
+    filename_hash = determine_dataset_filename(token, raw, non_AnnexI=non_AnnexI,
+                                               hash=True)
+    # primap2 native format
+    filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
+    if not filename_hash_nc.exists():
+        # if parent dir does not exist create it
+        # TODO double, also in determine_dataset_filename. same for country data
+        if not filename_hash.parent.exists():
+            filename_hash.parent.mkdir()
+        # save the data
+        print(f"Data has changed. Save to {filename_hash_nc.name}")
+        compression = dict(zlib=True, complevel=9)
+        encoding = {var: compression for var in data_pm2.data_vars}
+        data_pm2.pr.to_netcdf(filename_hash_nc, encoding=encoding)
+
+    # primap2 IF
+    filename_hash_csv = filename_hash.parent / (filename_hash.name + '.csv')
+    if not filename_hash_csv.exists():
+        # save the data
+        print(f"Data has changed. Save to {filename_hash.name + '.csv/.yaml'}")
+        pm2.pm2io.write_interchange_format(filename_hash, data_if)
+    else:
+        print(f"Data unchanged for {country_group}. Create symlinks.")
+
+    # get the filename with the date
+    filename_date = determine_dataset_filename(date_str, raw=raw,
+                                               non_AnnexI=non_AnnexI, hash=False)
 
 
     # create the symlinks to the actual data (with the hash)
     # create the symlinks to the actual data (with the hash)
     suffixes = ['.nc', '.csv', '.yaml']
     suffixes = ['.nc', '.csv', '.yaml']
@@ -773,6 +849,59 @@ def save_DI_country_data(
         file_date.symlink_to(file_hash)
         file_date.symlink_to(file_hash)
 
 
 
 
+## functions for multiple country reading
+def read_UNFCCC_DI_for_all_countries(
+        non_AnnexI: bool=True,
+) -> xr.Dataset:
+    '''
+    This function reads DI data for all countries in a group (annexI or non-AnnexI)
+    TODO: currently only non-annexI is implemented
+    The function reads all data in one go using datalad run. as the output data file
+    names are unknown beforehand datalad run uses explicit=false
+    TODO: decide if dataset creation goes in here as well. Makes sense, I think. Then
+    the function can return the xarray dataset
+    '''
+
+    today = date.today()
+    date_str = today.strftime(DI_date_format)
+
+    if non_AnnexI:
+        countries = nAI_countries
+    else:
+        raise ValueError("Bulk reading for AnnexI countries not implemented yet")
+
+    # read the data
+    data_all = None
+    for country in countries[0:5]:
+        print(f"reading DI data for country {country}")
+
+        try:
+            data_country = read_UNFCCC_DI_for_country(
+                country_code=country,
+                category_groups=None,  # read all categories
+                read_subsectors=False,  # not applicable as we read all categories
+                date_str=date_str,
+                pm2if_specifications=None,
+                # automatically use the right specs for AI and NAI
+                default_gwp=None,  # automatically uses right default GWP for AI and NAI
+                debug=False)
+
+            if data_all is None:
+                data_all = data_country
+            else:
+                data_all = data_all.pr.merge(data_country)
+        except unfccc_di_api.NoDataError as err:
+            print(f"No data for {country}.")
+            print(err)
+
+    # TODO: write metadata
+
+    # save the data
+    #save_DI_dataset(data_all, raw=True, non_AnnexI=non_AnnexI)
+
+    return data_all
+
+
 ## datalad and pydoit interface functions
 ## datalad and pydoit interface functions
 def read_DI_for_country_datalad(
 def read_DI_for_country_datalad(
         country: str,
         country: str,
@@ -790,7 +919,8 @@ def read_DI_for_country_datalad(
     """
     """
 
 
     # get date to determine output filename
     # get date to determine output filename
-    date_str = str(date.today())
+    today = date.today()
+    date_str = today.strftime(DI_date_format)
 
 
     # get all the info for the country
     # get all the info for the country
     country_info = get_input_and_output_files_for_country_DI(country, date_str,
     country_info = get_input_and_output_files_for_country_DI(country, date_str,
@@ -815,7 +945,7 @@ def read_DI_for_country_datalad(
             inputs=country_info["input"],
             inputs=country_info["input"],
             outputs=country_info["output"],
             outputs=country_info["output"],
             dry_run=None,
             dry_run=None,
-            explicit=True,
+            explicit=False,
         )
         )
     except IncompleteResultsError as IRE:
     except IncompleteResultsError as IRE:
         print(f"IncompleteResultsError occured when running {cmd}: {IRE}")
         print(f"IncompleteResultsError occured when running {cmd}: {IRE}")
@@ -865,7 +995,7 @@ def process_DI_for_country_datalad(
             inputs=country_info["input"],
             inputs=country_info["input"],
             outputs=country_info["output"],
             outputs=country_info["output"],
             dry_run=None,
             dry_run=None,
-            explicit=True,
+            explicit=False,
         )
         )
     except IncompleteResultsError as IRE:
     except IncompleteResultsError as IRE:
         print(f"IncompleteResultsError occurred when running {cmd}: {IRE}")
         print(f"IncompleteResultsError occurred when running {cmd}: {IRE}")
@@ -874,15 +1004,21 @@ def process_DI_for_country_datalad(
         print(ex.message)
         print(ex.message)
 
 
 
 
-## helper functions
-
+def read_DI_for_all_countries_datalad(
+        non_AnnexI: bool=True,
+):
+    '''
+    This function calls datalad run to read all data in one go. as the output data file
+    names are unknown beforehand datalad run uses explicit=false
+    '''
 
 
+## helper functions
 def determine_filename(
 def determine_filename(
         country_code: str,
         country_code: str,
         date_or_hash: str,
         date_or_hash: str,
         raw: bool=False,
         raw: bool=False,
         hash: bool=False,
         hash: bool=False,
-)->Path:
+) -> Path:
     """
     """
     Determine the filename for a dataset from given country code and date string.
     Determine the filename for a dataset from given country code and date string.
 
 
@@ -891,10 +1027,11 @@ def determine_filename(
     ----------
     ----------
     country_code: str
     country_code: str
         ISO 3 letter code of the country
         ISO 3 letter code of the country
-    date_str:
+    date_or_hash:
         formatted date string
         formatted date string
-    raw:
+    raw: bool
         bool specifying if filename fow raw or processed data should be returned
         bool specifying if filename fow raw or processed data should be returned
+    hash: str
 
 
     Returns
     Returns
     _______
     _______
@@ -903,7 +1040,7 @@ def determine_filename(
     """
     """
 
 
     # get the country folder
     # get the country folder
-    with open(extracted_data_path / "folder_mapping.json", "r") as mapping_file:
+    with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
         folder_mapping = json.load(mapping_file)
         folder_mapping = json.load(mapping_file)
 
 
     if country_code in folder_mapping:
     if country_code in folder_mapping:
@@ -912,14 +1049,14 @@ def determine_filename(
         country_folders = folder_mapping[country_code]
         country_folders = folder_mapping[country_code]
         if isinstance(country_folders, str):
         if isinstance(country_folders, str):
             # only one folder
             # only one folder
-            country_folder = extracted_data_path / country_folders
+            country_folder = extracted_data_path_UNFCCC / country_folders
         else:
         else:
             raise ValueError("More than one output folder for country "
             raise ValueError("More than one output folder for country "
                              f"{country_code}. This should not happen.")
                              f"{country_code}. This should not happen.")
     else:
     else:
         # folder not in mapping. It will be created if not present yet
         # folder not in mapping. It will be created if not present yet
         country_name = get_country_name(country_code)
         country_name = get_country_name(country_code)
-        country_folder = extracted_data_path / country_name.replace(" ", "_")
+        country_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
 
 
         if country_folder.exists():
         if country_folder.exists():
            print(f"Output folder {country_name.replace(' ', '_')} for country "
            print(f"Output folder {country_name.replace(' ', '_')} for country "
@@ -938,6 +1075,50 @@ def determine_filename(
     return filename.relative_to(root_path)
     return filename.relative_to(root_path)
 
 
 
 
+def determine_dataset_filename(
+        date_or_hash: str,
+        raw: bool=False,
+        non_AnnexI: bool=True,
+        hash: bool = False,
+) -> Path:
+    """
+    Determine the filename for a dataset from given country group and date string.
+
+    Parameters
+    ----------
+    date_or_hash:
+        formatted date string
+    raw: bool
+        bool specifying if filename for raw or processed data should be returned
+    non_AnnexI: bool
+        True if non-AnnexI False if AnnexI
+    hash: bool
+
+    Returns
+    _______
+        pathlib Path object for the file name (without suffix)
+    """
+
+    # get the country folder
+    if non_AnnexI:
+        current_dataset_path = dataset_path_UNFCCC / "DI_non_AnnexI"
+        filename = f"DI_non_AnnexI_{date_or_hash}"
+    else:
+        current_dataset_path = dataset_path_UNFCCC / "DI_AnnexI"
+        filename = f"DI_AnnexI_{date_or_hash}"
+
+    if not current_dataset_path.exists():
+        current_dataset_path.mkdir()
+
+    if raw:
+        filename = f"{filename}_raw"
+    if hash:
+        filename = f"{filename}_hash"
+    filename = current_dataset_path / filename
+
+    return filename.relative_to(root_path)
+
+
 def convert_categories(
 def convert_categories(
         ds_input: xr.Dataset,
         ds_input: xr.Dataset,
         conversion: Dict[str, Dict[str, str]],
         conversion: Dict[str, Dict[str, str]],
@@ -1090,7 +1271,7 @@ def get_present_hashes_for_country_DI(
         regex_hash = regex_hash + "hash\.nc"
         regex_hash = regex_hash + "hash\.nc"
 
 
     # get the country folder
     # get the country folder
-    with open(extracted_data_path / "folder_mapping.json", "r") as mapping_file:
+    with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
         folder_mapping = json.load(mapping_file)
         folder_mapping = json.load(mapping_file)
 
 
     if country_code in folder_mapping:
     if country_code in folder_mapping:
@@ -1099,7 +1280,7 @@ def get_present_hashes_for_country_DI(
         country_folders = folder_mapping[country_code]
         country_folders = folder_mapping[country_code]
         if isinstance(country_folders, str):
         if isinstance(country_folders, str):
             # only one folder
             # only one folder
-            country_folder = extracted_data_path / country_folders
+            country_folder = extracted_data_path_UNFCCC / country_folders
         else:
         else:
             raise ValueError("More than one output folder for country "
             raise ValueError("More than one output folder for country "
                              f"{country_code}. This should not happen.")
                              f"{country_code}. This should not happen.")
@@ -1135,7 +1316,7 @@ def find_latest_DI_data(
         regex = regex_date + r"\.nc"
         regex = regex_date + r"\.nc"
 
 
     # get the country folder
     # get the country folder
-    with open(extracted_data_path / "folder_mapping.json", "r") as mapping_file:
+    with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
         folder_mapping = json.load(mapping_file)
         folder_mapping = json.load(mapping_file)
 
 
     if country_code in folder_mapping:
     if country_code in folder_mapping:
@@ -1144,7 +1325,7 @@ def find_latest_DI_data(
         country_folders = folder_mapping[country_code]
         country_folders = folder_mapping[country_code]
         if isinstance(country_folders, str):
         if isinstance(country_folders, str):
             # only one folder
             # only one folder
-            country_folder = extracted_data_path / country_folders
+            country_folder = extracted_data_path_UNFCCC / country_folders
         else:
         else:
             raise ValueError("More than one output folder for country "
             raise ValueError("More than one output folder for country "
                              f"{country_code}. This should not happen.")
                              f"{country_code}. This should not happen.")

+ 4 - 1
UNFCCC_GHG_data/UNFCCC_DI_reader/__init__.py

@@ -5,7 +5,9 @@ from .UNFCCC_DI_reader_core import \
     read_UNFCCC_DI_for_country, read_DI_for_country_datalad, \
     read_UNFCCC_DI_for_country, read_DI_for_country_datalad, \
     process_UNFCCC_DI_for_country, process_and_save_UNFCCC_DI_for_country, \
     process_UNFCCC_DI_for_country, process_and_save_UNFCCC_DI_for_country, \
     process_DI_for_country_datalad, \
     process_DI_for_country_datalad, \
-    convert_DI_data_to_pm2_if, convert_DI_IF_data_to_pm2, determine_filename
+    convert_DI_data_to_pm2_if, convert_DI_IF_data_to_pm2, determine_filename, \
+    read_UNFCCC_DI_for_all_countries
+
 
 
 
 
 __all__ = [
 __all__ = [
@@ -17,4 +19,5 @@ __all__ = [
     "convert_DI_data_to_pm2_if",
     "convert_DI_data_to_pm2_if",
     "convert_DI_IF_data_to_pm2",
     "convert_DI_IF_data_to_pm2",
     "determine_filename",
     "determine_filename",
+    "read_UNFCCC_DI_for_all_countries",
 ]
 ]

+ 0 - 75
UNFCCC_GHG_data/UNFCCC_DI_reader/util.py

@@ -1,17 +1,6 @@
-from pathlib import Path
 import unfccc_di_api
 import unfccc_di_api
-# imports for copied functions
-import pycountry
-
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-log_path = root_path / "log"
-code_path = root_path / "UNFCCC_GHG_data"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / "extracted_data" / "UNFCCC"
 
 
 reader = unfccc_di_api.UNFCCCApiReader()
 reader = unfccc_di_api.UNFCCCApiReader()
-
 nAI_countries = list(reader.non_annex_one_reader.parties["code"])
 nAI_countries = list(reader.non_annex_one_reader.parties["code"])
 AI_countries = list(reader.annex_one_reader.parties["code"])
 AI_countries = list(reader.annex_one_reader.parties["code"])
 
 
@@ -22,67 +11,3 @@ class NoDIDataError(Exception):
     pass
     pass
 
 
 
 
-# the following is copied from other sub-packages
-# TODO: move these functions to common location to allow easy importing into all modules
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom of Great Britain and Northern Ireland",
-}
-
-
-def get_country_name(
-        country_code: str,
-) -> str:
-    """get country name from code """
-    if country_code in custom_country_mapping:
-        country_name = custom_country_mapping[country_code]
-    else:
-        try:
-            country = pycountry.countries.get(alpha_3=country_code)
-            country_name = country.name
-        except:
-            raise ValueError(f"Country code {country_code} can not be mapped to "
-                             f"any country")
-
-    return country_name
-
-
-def get_country_code(
-        country_name: str,
-)->str:
-    """
-    obtain country code. If the input is a code it will be returned, if the input
-    is not a three letter code a search will be performed
-
-    Parameters
-    __________
-    country_name: str
-        Country code or name to get the three-letter code for.
-
-    """
-    try:
-        # check if it's a 3 letter code
-        country = pycountry.countries.get(alpha_3=country_name)
-        country_code = country.alpha_3
-    except:
-        try:
-            country = pycountry.countries.search_fuzzy(country_name)
-        except:
-            raise ValueError(f"Country name {country_name} can not be mapped to "
-                             f"any country code")
-        if len(country) > 1:
-            country_code = None
-            for current_country in country:
-                if current_country.name == country_name:
-                    country_code = current_country.alpha_3
-            if country_code is None:
-                raise ValueError(f"Country name {country_name} has {len(country)} "
-                                 f"possible results for country codes.")
-
-        country_code = country[0].alpha_3
-
-    return country_code

+ 12 - 11
UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py

@@ -11,7 +11,7 @@ from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
 from pathlib import Path
 from pathlib import Path
 
 
-root = Path(__file__).parents[2]
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
 
 
 ###############
 ###############
 #
 #
@@ -77,12 +77,10 @@ else:
         "submissions/national-inventory-submissions-{}".format(year)
         "submissions/national-inventory-submissions-{}".format(year)
     )
     )
 
 
-download_path = root / "downloaded_data" / "UNFCCC"
-
 error_file_sizes = [212, 210]
 error_file_sizes = [212, 210]
 
 
 # Read submissions list
 # Read submissions list
-submissions = pd.read_csv(download_path / f"submissions-annexI_{year}.csv")
+submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-annexI_{year}.csv")
 
 
 # filter submissions list or category
 # filter submissions list or category
 items = submissions[submissions.Kind  == category.upper()]
 items = submissions[submissions.Kind  == category.upper()]
@@ -120,7 +118,7 @@ for idx, submission in items.iterrows():
     country = country.replace(' ', '_')
     country = country.replace(' ', '_')
     print(f"Downloading {title} from {url}")
     print(f"Downloading {title} from {url}")
 
 
-    country_folder = download_path / country
+    country_folder = downloaded_data_path_UNFCCC / country
     if not country_folder.exists():
     if not country_folder.exists():
         country_folder.mkdir()
         country_folder.mkdir()
     local_filename = \
     local_filename = \
@@ -167,7 +165,7 @@ for idx, submission in items.iterrows():
             
             
         if local_filename.exists():
         if local_filename.exists():
             new_downloaded.append(submission)
             new_downloaded.append(submission)
-            print(f"Download => {local_filename.relative_to(root)}")
+            print(f"Download => {local_filename.relative_to(root_path)}")
             # unzip data (only for new downloads)
             # unzip data (only for new downloads)
             if local_filename.suffix == ".zip":
             if local_filename.suffix == ".zip":
                 try:
                 try:
@@ -177,18 +175,21 @@ for idx, submission in items.iterrows():
                     zipped_file.close()
                     zipped_file.close()
                 # TODO Better error logging/visibilty
                 # TODO Better error logging/visibilty
                 except zipfile.BadZipFile:
                 except zipfile.BadZipFile:
-                    print(f"Error while trying to extract {local_filename.relative_to(root)}")
+                    print(f"Error while trying to extract "
+                          f"{local_filename.relative_to(root_path)}")
                 except NotImplementedError:
                 except NotImplementedError:
                     print("Zip format not supported, please unzip on the command line.")
                     print("Zip format not supported, please unzip on the command line.")
             else:
             else:
-                print(f"Not attempting to extract {local_filename.relative_to(root)}.")
+                print(f"Not attempting to extract "
+                      f"{local_filename.relative_to(root_path)}.")
         else:
         else:
-            print(f"Failed to download {local_filename.relative_to(root)}")
+            print(f"Failed to download {local_filename.relative_to(root_path)}")
 
 
     else:
     else:
-        print(f"=> Already downloaded {local_filename.relative_to(root)}")
+        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
 
 
 driver.close()
 driver.close()
 
 
 df = pd.DataFrame(new_downloaded)
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC
+          / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)

+ 7 - 9
UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py

@@ -5,9 +5,9 @@ import time
 import os
 import os
 from datetime import date
 from datetime import date
 from random import randrange
 from random import randrange
-
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 from pathlib import Path
 from pathlib import Path
-root = Path(__file__).parents[2]
+
 """
 """
 based on download_bur from national-inventory-submissions
 based on download_bur from national-inventory-submissions
 # (https://github.com/openclimatedata/national-inventory-submisions)
 # (https://github.com/openclimatedata/national-inventory-submisions)
@@ -35,13 +35,11 @@ url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"
 error_file_sizes = [212, 210]
 error_file_sizes = [212, 210]
 
 
 # Ensure download path and subfolders exist
 # Ensure download path and subfolders exist
-download_path = root / "downloaded_data" / "UNFCCC"
-if not download_path.exists():
-    download_path.mkdir(parents=True)
+if not downloaded_data_path_UNFCCC.exists():
+    downloaded_data_path_UNFCCC.mkdir(parents=True)
 
 
 new_downloaded = []
 new_downloaded = []
 
 
-
 for idx, submission in submissions.iterrows():
 for idx, submission in submissions.iterrows():
     print("=" * 60)
     print("=" * 60)
     ndc = submission.Number
     ndc = submission.Number
@@ -54,12 +52,12 @@ for idx, submission in submissions.iterrows():
 
 
     ndc_folder = "NDC_" + ndc + "_" + submission_date
     ndc_folder = "NDC_" + ndc + "_" + submission_date
 
 
-    country_folder = download_path / country
+    country_folder = downloaded_data_path_UNFCCC / country
     if not country_folder.exists():
     if not country_folder.exists():
         country_folder.mkdir()
         country_folder.mkdir()
     local_filename = country_folder / ndc_folder / url.split('/')[-1]
     local_filename = country_folder / ndc_folder / url.split('/')[-1]
     local_filename_underscore = \
     local_filename_underscore = \
-        download_path / country / ndc_folder / \
+        downloaded_data_path_UNFCCC / country / ndc_folder / \
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
     if not local_filename.parent.exists():
     if not local_filename.parent.exists():
         local_filename.parent.mkdir()
         local_filename.parent.mkdir()
@@ -102,4 +100,4 @@ for idx, submission in submissions.iterrows():
 
 
 
 
 df = pd.DataFrame(new_downloaded)
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)
+df.to_csv(downloaded_data_path_UNFCCC / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)

+ 8 - 9
UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py

@@ -9,8 +9,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
 from pathlib import Path
 from pathlib import Path
-
-root = Path(__file__).parents[2]
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
 
 
 ###############
 ###############
 #
 #
@@ -45,8 +44,7 @@ else:
 error_file_sizes = [212, 210]
 error_file_sizes = [212, 210]
 
 
 # Read submissions list
 # Read submissions list
-download_path = root / "downloaded_data" / "UNFCCC"
-submissions = pd.read_csv(download_path / f"submissions-{category.lower()}.csv")
+submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-{category.lower()}.csv")
 
 
 # set options for headless mode
 # set options for headless mode
 profile_path = ".firefox"
 profile_path = ".firefox"
@@ -82,7 +80,7 @@ for idx, submission in submissions.iterrows():
     country = country.replace(' ', '_')
     country = country.replace(' ', '_')
     print(f"Downloading {title} from {url}")
     print(f"Downloading {title} from {url}")
 
 
-    country_folder = download_path / country
+    country_folder = downloaded_data_path_UNFCCC / country
     if not country_folder.exists():
     if not country_folder.exists():
         country_folder.mkdir()
         country_folder.mkdir()
     local_filename = \
     local_filename = \
@@ -129,14 +127,15 @@ for idx, submission in submissions.iterrows():
             
             
         if local_filename.exists():
         if local_filename.exists():
             new_downloaded.append(submission)
             new_downloaded.append(submission)
-            print(f"Download => {local_filename.relative_to(root)}")
+            print(f"Download => {local_filename.relative_to(root_path)}")
         else:
         else:
-            print(f"Failed to download {local_filename.relative_to(root)}")
+            print(f"Failed to download {local_filename.relative_to(root_path)}")
 
 
     else:
     else:
-        print(f"=> Already downloaded {local_filename.relative_to(root)}")
+        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
 
 
 driver.close()
 driver.close()
 
 
 df = pd.DataFrame(new_downloaded)
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / f"00_new_downloads_{category}-{date.today()}.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC /
+          f"00_new_downloads_{category}-{date.today()}.csv", index=False)

+ 2 - 3
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py

@@ -8,8 +8,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
 from unfccc_submission_info import get_unfccc_submission_info
 from unfccc_submission_info import get_unfccc_submission_info
-
-root = Path(__file__).absolute().parents[2]
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 
 
 max_tries = 10
 max_tries = 10
 
 
@@ -143,4 +142,4 @@ if len(no_downloads) > 0:
 
 
 driver.close()
 driver.close()
 df = pd.DataFrame(downloads)
 df = pd.DataFrame(downloads)
-df.to_csv(root / "downloaded_data" / "UNFCCC" / f"submissions-annexI_{year}.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC / f"submissions-annexI_{year}.csv", index=False)

+ 2 - 3
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py

@@ -9,8 +9,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
 from unfccc_submission_info import get_unfccc_submission_info
 from unfccc_submission_info import get_unfccc_submission_info
-
-root = Path(__file__).absolute().parents[2]
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 
 
 """
 """
 Download UNFCCC Biennial Update Report submissions
 Download UNFCCC Biennial Update Report submissions
@@ -84,4 +83,4 @@ if len(no_downloads) > 0:
 driver.close()
 driver.close()
 df = pd.DataFrame(downloads)
 df = pd.DataFrame(downloads)
 df = df[["Kind", "Country", "Title", "URL"]]
 df = df[["Kind", "Country", "Title", "URL"]]
-df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-bur.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC / "submissions-bur.csv", index=False)

+ 3 - 4
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py

@@ -8,9 +8,8 @@ from bs4 import BeautifulSoup
 from selenium.webdriver import Firefox
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
-from unfccc_submission_info import get_unfccc_submission_info
-
-root = Path(__file__).absolute().parents[2]
+from .unfccc_submission_info import get_unfccc_submission_info
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 
 
 """
 """
 Download UNFCCC Biennial Update Report submissions
 Download UNFCCC Biennial Update Report submissions
@@ -85,4 +84,4 @@ if len(no_downloads) > 0:
 driver.close()
 driver.close()
 df = pd.DataFrame(downloads)
 df = pd.DataFrame(downloads)
 df = df[["Kind", "Country", "Title", "URL"]]
 df = df[["Kind", "Country", "Title", "URL"]]
-df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC / "submissions-nc.csv", index=False)

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/Argentina/read_ARG_BUR4_from_pdf.py

@@ -6,7 +6,7 @@ import sys
 import camelot
 import camelot
 import primap2 as pm2
 import primap2 as pm2
 from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
 from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
-from pathlib import Path
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
@@ -16,11 +16,6 @@ from pathlib import Path
 #  PRIMAP2 version
 #  PRIMAP2 version
 
 
 # folders and files
 # folders and files
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Argentina' / \
 input_folder = downloaded_data_path / 'UNFCCC' / 'Argentina' / \
                'BUR4'
                'BUR4'
 output_folder = extracted_data_path / 'UNFCCC' / 'Argentina'
 output_folder = extracted_data_path / 'UNFCCC' / 'Argentina'

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/Chile/read_CHL_BUR4_from_xlsx.py

@@ -5,9 +5,9 @@ import os
 import sys
 import sys
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 
 
 from config_CHL_BUR4 import cat_mapping, filter_remove_IPCC2006, aggregate_cats
 from config_CHL_BUR4 import cat_mapping, filter_remove_IPCC2006, aggregate_cats
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import filter_data
 from primap2.pm2io._data_reading import filter_data
 
 
@@ -16,11 +16,6 @@ from primap2.pm2io._data_reading import filter_data
 # ###
 # ###
 
 
 # folders and files
 # folders and files
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Chile' / 'BUR4'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Chile' / 'BUR4'
 output_folder = extracted_data_path / 'UNFCCC' / 'Chile'
 output_folder = extracted_data_path / 'UNFCCC' / 'Chile'
 if not output_folder.exists():
 if not output_folder.exists():

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Colombia/read_COL_BUR3_from_xlsx.py

@@ -4,19 +4,12 @@
 
 
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Colombia' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Colombia' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Colombia'
 output_folder = extracted_data_path / 'UNFCCC' / 'Colombia'
 if not output_folder.exists():
 if not output_folder.exists():

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Indonesia/read_IDN_BUR3_from_pdf.py

@@ -4,21 +4,14 @@
 
 
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import camelot
 import numpy as np
 import numpy as np
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Indonesia' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Indonesia' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Indonesia'
 output_folder = extracted_data_path / 'UNFCCC' / 'Indonesia'
 if not output_folder.exists():
 if not output_folder.exists():

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Mexico/read_MEX_BUR3_from_pdf.py

@@ -3,20 +3,13 @@
 
 
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import camelot
 from config_MEX_BUR3 import page_defs, fix_rows
 from config_MEX_BUR3 import page_defs, fix_rows
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Mexico' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Mexico' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Mexico'
 output_folder = extracted_data_path / 'UNFCCC' / 'Mexico'
 if not output_folder.exists():
 if not output_folder.exists():

+ 2 - 8
UNFCCC_GHG_data/UNFCCC_reader/Morocco/read_MAR_BUR3_from_pdf.py

@@ -5,25 +5,19 @@ import camelot
 import primap2 as pm2
 import primap2 as pm2
 import pandas as pd
 import pandas as pd
 import copy
 import copy
-from pathlib import Path
+
 from config_MAR_BUR3 import zero_cats, cat_mapping, aggregate_cats, remove_cats, \
 from config_MAR_BUR3 import zero_cats, cat_mapping, aggregate_cats, remove_cats, \
     table_defs, header_defs
     table_defs, header_defs
 from primap2.pm2io._data_reading import matches_time_format, filter_data
 from primap2.pm2io._data_reading import matches_time_format, filter_data
+from UNFCCC_GHG_data.helper import extracted_data_path, downloaded_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Morocco' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Morocco' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Morocco'
 output_folder = extracted_data_path / 'UNFCCC' / 'Morocco'
 output_filename = 'MAR_BUR3_2022_'
 output_filename = 'MAR_BUR3_2022_'
-
 inventory_file = 'Morocco_BUR3_Fr.pdf'
 inventory_file = 'Morocco_BUR3_Fr.pdf'
-
 gwp_to_use = 'AR4GWP100'
 gwp_to_use = 'AR4GWP100'
 
 
 # years to read
 # years to read

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_2021-Inventory_from_xlsx.py

@@ -5,24 +5,17 @@ import os
 import sys
 import sys
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 
 
 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from config_KOR_BUR4 import remove_cats, aggregate_before_mapping, cat_mapping, \
 from config_KOR_BUR4 import remove_cats, aggregate_before_mapping, cat_mapping, \
     aggregate_after_mapping, coords_terminologies_2006, filter_remove_2006, \
     aggregate_after_mapping, coords_terminologies_2006, filter_remove_2006, \
     filter_remove_after_agg
     filter_remove_after_agg
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import filter_data, matches_time_format
 from primap2.pm2io._data_reading import filter_data, matches_time_format
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
                '2021-Inventory'
                '2021-Inventory'
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Republic_of_Korea'
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Republic_of_Korea'

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py

@@ -5,19 +5,14 @@ import os
 import sys
 import sys
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 
 
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from primap2.pm2io._data_reading import filter_data
 from primap2.pm2io._data_reading import filter_data
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
                '2020-Inventory'
                '2020-Inventory'
 output_folder = extracted_data_path / 'UNFCCC' / 'Republic_of_Korea'
 output_folder = extracted_data_path / 'UNFCCC' / 'Republic_of_Korea'

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Taiwan/read_TWN_2022-Inventory_from_pdf.py

@@ -3,11 +3,10 @@
 
 
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import camelot
 import copy
 import copy
-#import re
 
 
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
 
 
 from config_TWN_NIR2022 import table_defs, page_defs
 from config_TWN_NIR2022 import table_defs, page_defs
@@ -17,12 +16,6 @@ from config_TWN_NIR2022 import gwp_to_use
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Taiwan'
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Taiwan'
 # TODO: move file to subfolder
 # TODO: move file to subfolder
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Taiwan'
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Taiwan'

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Thailand/read_THA_BUR3_from_pdf.py

@@ -1,23 +1,16 @@
 # this script reads data from Thailand's BUR3
 # this script reads data from Thailand's BUR3
 # Data is read from the pdf file
 # Data is read from the pdf file
-
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import camelot
 import copy
 import copy
 
 
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Thailand' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Thailand' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Thailand'
 output_folder = extracted_data_path / 'UNFCCC' / 'Thailand'
 if not output_folder.exists():
 if not output_folder.exists():

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/__init__.py

@@ -1,6 +1 @@
-# expose some of the functions to the outside as they are used in other readers as well
-# TODO: create a unified util module for all readers
-
-from .get_submissions_info import get_country_code
-
-__all__ = ["get_country_code"]
+#

+ 3 - 437
UNFCCC_GHG_data/UNFCCC_reader/get_submissions_info.py

@@ -5,324 +5,11 @@ from typing import List, Dict
 from pathlib import Path
 from pathlib import Path
 import json
 import json
 import pycountry
 import pycountry
-#import os
 
 
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-code_path = root_path / "UNFCCC_GHG_data" / "UNFCCC_reader"
-# beware, folders below are different than for CRF reader
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-legacy_data_path = root_path / "legacy_data"
-
-# TODO: move this to general util package
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom of Great Britain and Northern Ireland",
-}
-
-custom_folders = {
-    'Venezeula_(Bolivarian_Republic_of)': 'VEN',
-    'Venezuela_(Bolivarian_Republic_of)': 'VEN',
-    'Micronesia_(Federated_State_of)': 'FSM',
-    'Micronesia_(Federated_States_of)': 'FSM',
-    'The_Republic_of_North_Macedonia': 'MKD',
-    'Republic_of_Korea': 'KOR',
-    'Bolivia_(Plurinational_State_of)': 'BOL',
-    'Türkiye': 'TUR',
-    'Iran_(Islamic_Republic_of)': 'IRN',
-    'Côte_d’Ivoire': 'CIV',
-    'Democratic_Republic_of_the_Congo': "COD",
-    'European_Union': 'EUA',
-    'Taiwan': 'TWN',
-}
-
-def get_country_submissions(
-        country_name: str,
-        print_sub: bool = True,
-) -> Dict[str, List[str]]:
-    """
-    Input is a three letter ISO UNFCCC_GHG_data for a country, or the countries name.
-    The function tries to map the country name to an ISO UNFCCC_GHG_data and then
-    queries the folder mapping files for folders.
-
-    Parameters
-    ----------
-        country_name: str
-            String containing the country name or ISO 3 letter UNFCCC_GHG_data
-
-        print_sub: bool
-            If True information on submissions will be written to stdout
-
-    Returns
-    -------
-        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
-        Each value is a list of folders
-
-    """
-
-    data_folder = downloaded_data_path
-
-    country_code = get_country_code(country_name)
-
-    if print_sub:
-        print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
-
-    country_submissions = {}
-    if print_sub:
-        print(f"#" * 80)
-        print(f"The following submissions are available for {country_name}")
-    for item in data_folder.iterdir():
-        if item.is_dir():
-            if print_sub:
-                print("")
-                print("-" * 80)
-                print(f"Data folder {item.name}")
-                print("-" * 80)
-            with open(item / "folder_mapping.json", "r") as mapping_file:
-                folder_mapping = json.load(mapping_file)
-            if country_code in folder_mapping:
-                country_folders = folder_mapping[country_code]
-                if isinstance(country_folders, str):
-                    # only one folder
-                    country_folders = [country_folders]
-
-                submission_folders = []
-                for country_folder in country_folders:
-                    current_folder = item / country_folder
-                    if print_sub:
-                        print(f"Submissions in folder {country_folder}:")
-
-                    for submission_folder in current_folder.iterdir():
-                        if submission_folder.is_dir():
-                            if print_sub:
-                                print(submission_folder.name)
-                            submission_folders.append(submission_folder.name)
-
-                country_submissions[item.name] = submission_folders
-            else:
-                print(f"No submissions available for {country_name}.")
-
-    return country_submissions
-
-
-def get_country_datasets(
-        country_name: str,
-        print_ds: bool = True,
-) -> Dict[str, List[str]]:
-    """
-    Input is a three letter ISO UNFCCC_GHG_data for a country, or the country's name.
-    The function tries to map the country name to an ISO UNFCCC_GHG_data and then
-    checks the UNFCCC_GHG_data and data folders for content on the country.
-
-    Parameters
-    ----------
-        country_name: str
-            String containing the country name or ISO 3 letter UNFCCC_GHG_data
-
-        print_ds: bool
-            If True information on submissions will be written to stdout
-
-    Returns
-    -------
-        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
-        Each value is a list of folders
-
-    """
-
-    data_folder = extracted_data_path
-    data_folder_legacy = legacy_data_path
-
-
-    # obtain country UNFCCC_GHG_data
-    country_code = get_country_code(country_name)
-
-    if print_ds:
-        print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper import get_country_code
 
 
-    rep_data = {}
-    # data
-    if print_ds:
-        print(f"#" * 80)
-        print(f"The following datasets are available for {country_name}")
-    for item in data_folder.iterdir():
-        if item.is_dir():
-            cleaned_datasets_current_folder = {}
-            if print_ds:
-                print("-" * 80)
-                print(f"Data folder {item.name}")
-                print("-" * 80)
-            with open(item / "folder_mapping.json", "r") as mapping_file:
-                folder_mapping = json.load(mapping_file)
-            if country_code not in folder_mapping:
-                if print_ds:
-                    print("No data available")
-                    print("")
-            else:
-                country_folder = folder_mapping[country_code]
-                if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
-
-                datasets_current_folder = {}
-                current_folder = item / country_folder
-
-                for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
-                        if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
-                        else:
-                            datasets_current_folder[data_file.stem] = [data_file.suffix]
-
-                for dataset in datasets_current_folder:
-                    # process filename to get submission
-                    parts = dataset.split('_')
-                    if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
-                    else:
-                        terminology = "_".join(parts[3 : ])
-                        key = f"{parts[1]} ({parts[2]}, {terminology})"
-                        data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
-                            data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
-                            data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF (.yaml), "
-
-                        code_file = get_code_file(country_code, parts[1])
-                        if code_file:
-                            data_info = data_info + f"UNFCCC_GHG_data: {code_file.name}"
-                        else:
-                            data_info = data_info + f"UNFCCC_GHG_data: not found"
-
-                        cleaned_datasets_current_folder[key] = data_info
-
-                if print_ds:
-                    if cleaned_datasets_current_folder:
-                        for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
-                    else:
-                        print("No data available")
-                    print("")
-
-            rep_data[item.name] = cleaned_datasets_current_folder
-
-    # legacy data
-    if print_ds:
-        print(f"#" * 80)
-        print(f"The following legacy datasets are available for {country_name}")
-    legacy_data = {}
-    for item in data_folder_legacy.iterdir():
-        if item.is_dir():
-            cleaned_datasets_current_folder = {}
-            if print_ds:
-                print("-" * 80)
-                print(f"Data folder {item.name}")
-                print("-" * 80)
-            with open(item / "folder_mapping.json", "r") as mapping_file:
-                folder_mapping = json.load(mapping_file)
-            if country_code not in folder_mapping:
-                if print_ds:
-                    print("No data available")
-                    print("")
-            else:
-                country_folder = folder_mapping[country_code]
-                if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
-
-                datasets_current_folder = {}
-                current_folder = item / country_folder
-
-                for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
-                        if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
-                        else:
-                            datasets_current_folder[data_file.stem] = [data_file.suffix]
-
-                for dataset in datasets_current_folder:
-                    # process filename to get submission
-                    parts = dataset.split('_')
-                    if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
-                    else:
-                        terminology = "_".join(parts[3 : ])
-                        key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
-                        data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
-                            data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
-                            data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF (.yaml), "
-
-                        cleaned_datasets_current_folder[key] = data_info
-
-                if print_ds:
-                    if cleaned_datasets_current_folder:
-                        for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
-                    else:
-                        print("No data available")
-                    print("")
-
-                legacy_data[item.name] = cleaned_datasets_current_folder
-
-    all_data = {
-        "rep_data": rep_data,
-        "legacy_data": legacy_data,
-    }
-
-    return all_data
-
-
-def get_country_code(
-        country_name: str,
-)->str:
-    """
-    obtain country UNFCCC_GHG_data. If the input is a UNFCCC_GHG_data it will be returned, if the input
-    is not a three letter UNFCCC_GHG_data a search will be performed
-
-    Parameters
-    __________
-    country_name: str
-        Country UNFCCC_GHG_data or name to get the three-letter UNFCCC_GHG_data for.
-
-    """
-    # First check if it's in the list of custom codes
-    if country_name in custom_country_mapping:
-        country_code = country_name
-    else:
-        try:
-            # check if it's a 3 letter UNFCCC_GHG_data
-            country = pycountry.countries.get(alpha_3=country_name)
-            country_code = country.alpha_3
-        except:
-            try:
-                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
-            except:
-                raise ValueError(f"Country name {country_name} can not be mapped to "
-                                 f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
-            if len(country) > 1:
-                country_code = None
-                for current_country in country:
-                    if current_country.name == country_name:
-                        country_code = current_country.alpha_3
-                if country_code is None:
-                    raise ValueError(f"Country name {country_name} has {len(country)} "
-                                     f"possible results for country codes.")
-
-            country_code = country[0].alpha_3
-
-    return country_code
+code_path = root_path / "UNFCCC_GHG_data" / "UNFCCC_reader"
 
 
 
 
 def get_possible_inputs(
 def get_possible_inputs(
@@ -446,128 +133,7 @@ def get_possible_outputs(
     return output_files
     return output_files
 
 
 
 
-def get_code_file(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
-) -> Path:
-    """
-    For given country name and submission find the script that creates the data
 
 
-    Parameters
-    ----------
-        country_name: str
-            String containing the country name or ISO 3 letter UNFCCC_GHG_data
-
-        submission: str
-            String of the submission
-
-        print_info: bool = False
-            If True print information on UNFCCC_GHG_data found
-
-    Returns
-    -------
-        returns a pathlib Path object for the UNFCCC_GHG_data file
-    """
-
-    code_file_path = None
-
-    # CRF is an exception as it's read using the UNFCCC_CRF_reader module
-    # so we return the path to that.
-    if submission[0:3] == "CRF":
-        return root_path / "UNFCCC_CRF_reader"
 
 
-    # obtain country UNFCCC_GHG_data
-    country_code = get_country_code(country_name)
-
-    if print_info:
-        print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
-
-    with open(code_path / "folder_mapping.json", "r") as mapping_file:
-        folder_mapping = json.load(mapping_file)
-
-    if country_code not in folder_mapping:
-        if print_info:
-            print("No UNFCCC_GHG_data available")
-            print("")
-    else:
-        country_folder = code_path / folder_mapping[country_code]
-        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
-
-        for file in country_folder.iterdir():
-            if file.match(code_file_name_candidate):
-                if code_file_path is not None:
-                    raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
-                                     f"{code_file_path} and file.name. "
-                                     f"Please use only one file with name "
-                                     f"'read_ISO3_submission_XXX.YYY'.")
-                else:
-                    if print_info:
-                        print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
-                code_file_path = file
-
-    if code_file_path is not None:
-        return code_file_path.relative_to(root_path)
-    else:
-        return None
-
-
-def create_folder_mapping(
-        folder: str,
-        extracted: bool = False
-) -> None:
-    """
-    Create a mapping from 3 letter ISO country codes to folders
-    based on the subfolders of the given folder. The mapping is
-    stored in 'folder_mapping.json' in the given folder. Folder
-    must be given relative to the repository root
 
 
-    Parameters
-    ----------
-        folder: str
-            folder to create the mapping for
-        extracted: bool = False
-            If true treat the folder as extracted data, where we
-            only have one folder per country and no typos in the
-            names
-
-    Returns
-    -------
-        Nothing
-
-    """
 
 
-    folder = root_path / folder
-    folder_mapping = {}
-    #if not extracted:
-    known_folders = custom_folders
-    #else:
-    #    known_folders = {}
-
-    for item in folder.iterdir():
-        if item.is_dir() and not item.match("__pycache__"):
-            if item.name in known_folders:
-                ISO3 = known_folders[item.name]
-            else:
-                try:
-                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
-                    if len(country) > 1:
-                        ISO3 = None
-                        for current_country in country:
-                            if current_country.name == item.name.replace("_", " "):
-                                ISO3 = current_country.alpha_3
-                    else:
-                        ISO3 = country[0].alpha_3
-                except:
-                    ISO3 = None
-
-            if ISO3 is None:
-                print(f"No match for {item.name}")
-            else:
-                if ISO3 in folder_mapping.keys():
-                    folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
-                else:
-                    folder_mapping[ISO3] = item.name
-
-    with open(folder / "folder_mapping.json", "w") as mapping_file:
-        json.dump(folder_mapping, mapping_file, indent=4)

+ 3 - 9
UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py

@@ -1,15 +1,12 @@
 # this script takes submission and country as input (from doit) and
 # this script takes submission and country as input (from doit) and
 # runs the appropriate script to extract the submission data
 # runs the appropriate script to extract the submission data
 
 
-import sys
 import datalad.api
 import datalad.api
-from pathlib import Path
 import argparse
 import argparse
 from get_submissions_info import get_code_file
 from get_submissions_info import get_code_file
 from get_submissions_info import get_possible_inputs
 from get_submissions_info import get_possible_inputs
 from get_submissions_info import get_possible_outputs
 from get_submissions_info import get_possible_outputs
-
-
+from UNFCCC_GHG_data.helper import root_path
 
 
 # Find the right function and possible input and output files and
 # Find the right function and possible input and output files and
 # read the data using datalad run.
 # read the data using datalad run.
@@ -22,9 +19,6 @@ args = parser.parse_args()
 country = args.country
 country = args.country
 submission = args.submission
 submission = args.submission
 
 
-codepath = Path(__file__).parent
-rootpath = codepath / ".." / ".."
-rootpath = rootpath.resolve()
 
 
 print(f"Attempting to extract data for {submission} from {country}.")
 print(f"Attempting to extract data for {submission} from {country}.")
 print("#"*80)
 print("#"*80)
@@ -49,7 +43,7 @@ if script_name is not None:
         print("")
         print("")
     # make input files absolute to avoid datalad confusions when
     # make input files absolute to avoid datalad confusions when
     # root directory is via symlink
     # root directory is via symlink
-    input_files = [rootpath / file for file in input_files]
+    input_files = [root_path / file for file in input_files]
     # convert file's path to str
     # convert file's path to str
     input_files = [file.as_posix() for file in input_files]
     input_files = [file.as_posix() for file in input_files]
 
 
@@ -69,7 +63,7 @@ if script_name is not None:
     print(f"Run the script using datalad run via the python api")
     print(f"Run the script using datalad run via the python api")
     datalad.api.run(
     datalad.api.run(
         cmd=f"./venv/bin/python3 {script_name.as_posix()}",
         cmd=f"./venv/bin/python3 {script_name.as_posix()}",
-        dataset=rootpath,
+        dataset=root_path,
         message=f"Read data for {country}, {submission}.",
         message=f"Read data for {country}, {submission}.",
         inputs=input_files,
         inputs=input_files,
         outputs=output_files,
         outputs=output_files,

+ 6 - 1
UNFCCC_GHG_data/__init__.py

@@ -2,7 +2,12 @@
 
 
 from . import UNFCCC_reader
 from . import UNFCCC_reader
 from . import UNFCCC_CRF_reader
 from . import UNFCCC_CRF_reader
+from . import helper
 # import UNFCCC_DI_reader
 # import UNFCCC_DI_reader
 # import UNFCCC_downloader
 # import UNFCCC_downloader
 
 
-__all__ = ["UNFCCC_reader", "UNFCCC_CRF_reader"]
+__all__ = [
+    "UNFCCC_reader",
+    "UNFCCC_CRF_reader",
+    "helper",
+]

+ 24 - 0
UNFCCC_GHG_data/helper/__init__.py

@@ -0,0 +1,24 @@
+from .definitions import root_path, code_path, log_path
+from .definitions import extracted_data_path, extracted_data_path_UNFCCC
+from .definitions import legacy_data_path
+from .definitions import downloaded_data_path, downloaded_data_path_UNFCCC
+from .definitions import dataset_path, dataset_path_UNFCCC
+from .definitions import custom_country_mapping, custom_folders
+from .functions import get_country_code, get_country_name
+
+__all__ = [
+    "root_path",
+    "code_path",
+    "log_path",
+    "extracted_data_path",
+    "extracted_data_path_UNFCCC",
+    "legacy_data_path",
+    "downloaded_data_path",
+    "downloaded_data_path_UNFCCC",
+    "dataset_path",
+    "dataset_path_UNFCCC",
+    "custom_country_mapping",
+    "custom_folders",
+    "get_country_code",
+    "get_country_name",
+]

+ 2 - 2
UNFCCC_GHG_data/UNFCCC_reader/country_info.py → UNFCCC_GHG_data/helper/country_info.py

@@ -2,8 +2,8 @@
 # runs displays available submissions and datasets
 # runs displays available submissions and datasets
 
 
 import argparse
 import argparse
-from get_submissions_info import get_country_submissions
-from get_submissions_info import get_country_datasets
+from UNFCCC_GHG_data.helper.functions import get_country_submissions
+from UNFCCC_GHG_data.helper.functions import get_country_datasets
 
 
 # Find the right function and possible input and output files and
 # Find the right function and possible input and output files and
 # read the data using datalad run.
 # read the data using datalad run.

+ 49 - 0
UNFCCC_GHG_data/helper/definitions.py

@@ -0,0 +1,49 @@
+import os
+from pathlib import Path
+
+
def get_root_path() -> Path:
    """Return the repository root directory.

    The location is taken from the ``UNFCCC_GHG_ROOT_PATH`` environment
    variable and resolved to an absolute path.

    Raises
    ------
        ValueError if the environment variable is not set
    """
    configured = os.getenv('UNFCCC_GHG_ROOT_PATH')
    if configured is not None:
        return Path(configured).resolve()
    raise ValueError('UNFCCC_GHG_ROOT_PATH environment variable needs to be set')
+
# The root path is resolved once at import time from the UNFCCC_GHG_ROOT_PATH
# environment variable; every other location below is derived from it.
root_path = get_root_path()
code_path = root_path / "UNFCCC_GHG_data"
log_path = root_path / "log"
extracted_data_path = root_path / "extracted_data"
extracted_data_path_UNFCCC = extracted_data_path / "UNFCCC"
downloaded_data_path = root_path / "downloaded_data"
downloaded_data_path_UNFCCC = downloaded_data_path / "UNFCCC"
legacy_data_path = root_path / "legacy_data"
dataset_path = root_path / "datasets"
dataset_path_UNFCCC = dataset_path / "UNFCCC"


# Non-standard three-letter codes used in the UNFCCC data, mapped to the
# country name they stand for (several codes can map to the same country,
# e.g. EUA and EUC both denote the European Union).
custom_country_mapping = {
    "EUA": "European Union",
    "EUC": "European Union",
    "FRK": "France",
    "DKE": "Denmark",
    "DNM": "Denmark",
    "GBK": "United Kingdom of Great Britain and Northern Ireland",
}

# Downloaded-data folder names that fuzzy country-name matching cannot
# resolve, mapped to their ISO3 code. The keys intentionally include
# misspellings present in existing folder names (e.g. 'Venezeula',
# 'Federated_State_of') — do not "fix" them here or the lookup breaks.
custom_folders = {
    'Venezeula_(Bolivarian_Republic_of)': 'VEN',
    'Venezuela_(Bolivarian_Republic_of)': 'VEN',
    'Micronesia_(Federated_State_of)': 'FSM',
    'Micronesia_(Federated_States_of)': 'FSM',
    'The_Republic_of_North_Macedonia': 'MKD',
    'Republic_of_Korea': 'KOR',
    'Bolivia_(Plurinational_State_of)': 'BOL',
    'Türkiye': 'TUR',
    'Iran_(Islamic_Republic_of)': 'IRN',
    'Côte_d’Ivoire': 'CIV',
    'Democratic_Republic_of_the_Congo': "COD",
    'European_Union': 'EUA',
    'Taiwan': 'TWN',
}

+ 1 - 1
UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py → UNFCCC_GHG_data/helper/folder_mapping.py

@@ -3,7 +3,7 @@
 # for that folder
 # for that folder
 
 
 import argparse
 import argparse
-from get_submissions_info import create_folder_mapping
+from .functions import create_folder_mapping
 
 
 # Find the right function and possible input and output files and
 # Find the right function and possible input and output files and
 # read the data using datalad run.
 # read the data using datalad run.

+ 445 - 0
UNFCCC_GHG_data/helper/functions.py

@@ -0,0 +1,445 @@
+import pycountry
+import json
+from typing import Dict, List
+from pathlib import Path
+from .definitions import custom_country_mapping, custom_folders
+from .definitions import root_path, downloaded_data_path, extracted_data_path
+from .definitions import legacy_data_path, code_path
+
+
def get_country_name(
        country_code: str,
) -> str:
    """
    Get the country name for a three-letter country code.

    Custom codes (e.g. EUA for the European Union) are resolved via
    `custom_country_mapping` first, everything else is looked up in pycountry.

    Parameters
    ----------
        country_code: str
            three-letter country code

    Returns
    -------
        country name: str

    Raises
    ------
        ValueError if the code can not be mapped to a country
    """
    if country_code in custom_country_mapping:
        return custom_country_mapping[country_code]

    # pycountry returns None for unknown codes instead of raising; the old
    # bare `except:` only worked because of the resulting AttributeError
    country = pycountry.countries.get(alpha_3=country_code)
    if country is None:
        raise ValueError(f"Country code {country_code} can not be mapped to "
                         f"any country")
    return country.name
+
+
def get_country_code(
        country_name: str,
) -> str:
    """
    Obtain a three-letter country code. If the input already is a code it is
    returned unchanged; otherwise a fuzzy name search is performed.

    Parameters
    __________
    country_name: str
        Country code or name to get the three-letter code for.

    Returns
    -------
        country_code: str

    Raises
    ------
        ValueError if no code or several ambiguous codes are found
    """
    # First check if it's in the list of custom codes
    if country_name in custom_country_mapping:
        country_code = country_name
    else:
        # check if it's a 3 letter code; pycountry returns None on no match
        country = pycountry.countries.get(alpha_3=country_name)
        if country is not None:
            country_code = country.alpha_3
        else:
            try:
                country = pycountry.countries.search_fuzzy(
                    country_name.replace("_", " "))
            except LookupError:
                raise ValueError(f"Country name {country_name} can not be mapped to "
                                 f"any country code. Try using the ISO3 code directly.")
            if len(country) > 1:
                country_code = None
                # prefer an exact name match among the fuzzy results
                for current_country in country:
                    if current_country.name == country_name:
                        country_code = current_country.alpha_3
                if country_code is None:
                    raise ValueError(f"Country name {country_name} has {len(country)} "
                                     f"possible results for country codes.")
                # NOTE: previously the exact match was discarded here by an
                # unconditional `country_code = country[0].alpha_3`
            else:
                country_code = country[0].alpha_3

    return country_code
+
+
def create_folder_mapping(
        folder: str,
        extracted: bool = False
) -> None:
    """
    Create a mapping from 3 letter ISO country codes to folders
    based on the subfolders of the given folder. The mapping is
    stored in 'folder_mapping.json' in the given folder. Folder
    must be given relative to the repository root

    Parameters
    ----------
        folder: str
            folder to create the mapping for
        extracted: bool = False
            If true treat the folder as extracted data, where we
            only have one folder per country and no typos in the
            names

    Returns
    -------
        Nothing

    """

    folder = root_path / folder
    folder_mapping = {}
    # the custom folder names are always consulted; for extracted data they
    # simply never match (TODO: use an empty dict when `extracted` is True)
    known_folders = custom_folders

    for item in folder.iterdir():
        if item.is_dir() and not item.match("__pycache__"):
            if item.name in known_folders:
                ISO3 = known_folders[item.name]
            else:
                try:
                    country = pycountry.countries.search_fuzzy(
                        item.name.replace("_", " "))
                    if len(country) > 1:
                        # only accept an exact name match when ambiguous
                        ISO3 = None
                        for current_country in country:
                            if current_country.name == item.name.replace("_", " "):
                                ISO3 = current_country.alpha_3
                    else:
                        ISO3 = country[0].alpha_3
                except LookupError:
                    # no fuzzy match at all
                    ISO3 = None

            if ISO3 is None:
                print(f"No match for {item.name}")
            elif ISO3 in folder_mapping:
                # a country can own several folders (e.g. name variants);
                # previously a third folder produced a nested list
                existing = folder_mapping[ISO3]
                if isinstance(existing, list):
                    existing.append(item.name)
                else:
                    folder_mapping[ISO3] = [existing, item.name]
            else:
                folder_mapping[ISO3] = item.name

    with open(folder / "folder_mapping.json", "w") as mapping_file:
        json.dump(folder_mapping, mapping_file, indent=4)
+
+
# TODO add crf
def get_country_submissions(
        country_name: str,
        print_sub: bool = True,
) -> Dict[str, List[str]]:
    """
    Input is a three letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    queries the folder mapping files for folders.

    Parameters
    ----------
        country_name: str
            String containing the country name or ISO 3 letter code

        print_sub: bool
            If True information on submissions will be written to stdout

    Returns
    -------
        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
        Each value is a list of folders

    """

    data_folder = downloaded_data_path

    country_code = get_country_code(country_name)

    if print_sub:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    country_submissions = {}
    if print_sub:
        print("#" * 80)
        print(f"The following submissions are available for {country_name}")
    for item in data_folder.iterdir():
        if not item.is_dir():
            continue
        if print_sub:
            print("")
            print("-" * 80)
            print(f"Data folder {item.name}")
            print("-" * 80)
        with open(item / "folder_mapping.json", "r") as mapping_file:
            folder_mapping = json.load(mapping_file)
        if country_code in folder_mapping:
            country_folders = folder_mapping[country_code]
            if isinstance(country_folders, str):
                # only one folder
                country_folders = [country_folders]

            submission_folders = []
            for country_folder in country_folders:
                current_folder = item / country_folder
                if print_sub:
                    print(f"Submissions in folder {country_folder}:")

                for submission_folder in current_folder.iterdir():
                    if submission_folder.is_dir():
                        if print_sub:
                            print(submission_folder.name)
                        submission_folders.append(submission_folder.name)

            country_submissions[item.name] = submission_folders
        elif print_sub:
            # was printed unconditionally before, ignoring print_sub
            print(f"No submissions available for {country_name}.")

    return country_submissions
+
+
def _scan_dataset_files(current_folder: Path) -> Dict[str, List[str]]:
    """Map dataset file stems in *current_folder* to the list of present
    suffixes (only '.nc', '.yaml' and '.csv' files are considered)."""
    datasets_current_folder = {}
    for data_file in current_folder.iterdir():
        if data_file.suffix in ['.nc', '.yaml', '.csv']:
            datasets_current_folder.setdefault(
                data_file.stem, []).append(data_file.suffix)
    return datasets_current_folder


def _describe_formats(suffixes: List[str]) -> str:
    """Build the human readable format info string for one dataset from the
    list of file suffixes found for it."""
    data_info = ""
    if '.nc' in suffixes:
        data_info = data_info + "NF (.nc), "
    # a complete interchange format needs both the .yaml metadata and the .csv
    if ('.csv' in suffixes) and ('.yaml' in suffixes):
        data_info = data_info + "IF (.yaml + .csv), "
    elif '.csv' in suffixes:
        data_info = data_info + "incomplete IF? (.csv), "
    elif '.yaml' in suffixes:
        data_info = data_info + "incomplete IF (.yaml), "
    return data_info


def get_country_datasets(
        country_name: str,
        print_ds: bool = True,
) -> Dict[str, List[str]]:
    """
    List the extracted datasets (current and legacy) available for a country.

    Input is a three letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    checks the extracted data and legacy data folders for content on the
    country.

    Parameters
    ----------
        country_name: str
            String containing the country name or ISO 3 letter code

        print_ds: bool
            If True information on datasets will be written to stdout

    Returns
    -------
        returns a dict with keys "rep_data" and "legacy_data". Each value is
        a dict mapping data folder names to the datasets found there.

    """

    data_folder = extracted_data_path
    data_folder_legacy = legacy_data_path

    # obtain country code
    country_code = get_country_code(country_name)

    if print_ds:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    rep_data = {}
    # data
    if print_ds:
        print("#" * 80)
        print(f"The following datasets are available for {country_name}")
    for item in data_folder.iterdir():
        if not item.is_dir():
            continue
        cleaned_datasets_current_folder = {}
        if print_ds:
            print("-" * 80)
            print(f"Data folder {item.name}")
            print("-" * 80)
        with open(item / "folder_mapping.json", "r") as mapping_file:
            folder_mapping = json.load(mapping_file)
        if country_code not in folder_mapping:
            if print_ds:
                print("No data available")
                print("")
        else:
            country_folder = folder_mapping[country_code]
            if not isinstance(country_folder, str):
                raise ValueError("Wrong data type in folder mapping json file. Should be str.")

            datasets_current_folder = _scan_dataset_files(item / country_folder)

            for dataset, suffixes in datasets_current_folder.items():
                # process filename to get submission
                parts = dataset.split('_')
                if parts[0] != country_code:
                    cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = \
                        dataset
                else:
                    terminology = "_".join(parts[3:])
                    key = f"{parts[1]} ({parts[2]}, {terminology})"
                    data_info = _describe_formats(suffixes)

                    code_file = get_code_file(country_code, parts[1])
                    if code_file:
                        data_info = data_info + f"code: {code_file.name}"
                    else:
                        data_info = data_info + "code: not found"

                    cleaned_datasets_current_folder[key] = data_info

            if print_ds:
                if cleaned_datasets_current_folder:
                    for country_ds in cleaned_datasets_current_folder:
                        print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
                else:
                    print("No data available")
                print("")

        rep_data[item.name] = cleaned_datasets_current_folder

    # legacy data
    if print_ds:
        print("#" * 80)
        print(f"The following legacy datasets are available for {country_name}")
    legacy_data = {}
    for item in data_folder_legacy.iterdir():
        if not item.is_dir():
            continue
        cleaned_datasets_current_folder = {}
        if print_ds:
            print("-" * 80)
            print(f"Data folder {item.name}")
            print("-" * 80)
        with open(item / "folder_mapping.json", "r") as mapping_file:
            folder_mapping = json.load(mapping_file)
        if country_code not in folder_mapping:
            if print_ds:
                print("No data available")
                print("")
        else:
            country_folder = folder_mapping[country_code]
            if not isinstance(country_folder, str):
                raise ValueError("Wrong data type in folder mapping json file. Should be str.")

            datasets_current_folder = _scan_dataset_files(item / country_folder)

            for dataset, suffixes in datasets_current_folder.items():
                # process filename to get submission
                parts = dataset.split('_')
                if parts[0] != country_code:
                    # message kept consistent with the non-legacy loop above
                    cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
                else:
                    terminology = "_".join(parts[3:])
                    key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                    # no code file lookup for legacy data: it was not produced
                    # by scripts in this repository
                    cleaned_datasets_current_folder[key] = _describe_formats(suffixes)

            if print_ds:
                if cleaned_datasets_current_folder:
                    for country_ds in cleaned_datasets_current_folder:
                        print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
                else:
                    print("No data available")
                print("")

        # record the folder unconditionally (empty dict when nothing found),
        # consistent with rep_data above
        legacy_data[item.name] = cleaned_datasets_current_folder

    all_data = {
        "rep_data": rep_data,
        "legacy_data": legacy_data,
    }

    return all_data
+
+
def get_code_file(
        country_name: str,
        submission: str,
        print_info: bool = False,
) -> Path:
    """
    For given country name and submission find the script that creates the data

    Parameters
    ----------
        country_name: str
            String containing the country name or ISO 3 letter code

        submission: str
            String of the submission

        print_info: bool = False
            If True print information on the code file found

    Returns
    -------
        returns a pathlib Path object for the code file, relative to the
        repository root, or None if no code file was found

    Raises
    ------
        ValueError
            If more than one reader script matches the submission (ambiguous)
    """

    code_file_path = None
    UNFCCC_reader_path = code_path / "UNFCCC_reader"

    # CRF is an exception as it's read using the UNFCCC_CRF_reader module
    # so we return the path to that.
    if submission[0:3] == "CRF":
        return root_path / "UNFCCC_CRF_reader"

    # DI data is read using the UNFCCC_DI_reader module
    if submission[0:2] == "DI":
        return root_path / "UNFCCC_DI_reader"

    # obtain country code
    country_code = get_country_code(country_name)

    if print_info:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    with open(UNFCCC_reader_path / "folder_mapping.json", "r") as mapping_file:
        folder_mapping = json.load(mapping_file)

    if country_code not in folder_mapping:
        if print_info:
            print("No code available")
            print("")
    else:
        country_folder = UNFCCC_reader_path / folder_mapping[country_code]
        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"

        for file in country_folder.iterdir():
            if file.match(code_file_name_candidate):
                if code_file_path is not None:
                    # more than one reader script matches: ambiguous, so fail
                    # loudly instead of silently picking one
                    raise ValueError(f"Found multiple code file candidates: "
                                     f"{code_file_path} and {file.name}. "
                                     f"Please use only one file with name "
                                     f"'read_ISO3_submission_XXX.YYY'.")
                else:
                    if print_info:
                        print(f"Found code file {file.relative_to(root_path)}")
                code_file_path = file

    if code_file_path is not None:
        return code_file_path.relative_to(root_path)
    else:
        return None

+ 38 - 11
dodo.py

@@ -1,5 +1,6 @@
 # define tasks for UNFCCC data repository
 # define tasks for UNFCCC data repository
 from doit import get_var
 from doit import get_var
+import os
 
 
 # TODO: task for folder mapping
 # TODO: task for folder mapping
 
 
@@ -18,6 +19,18 @@ def task_setup_venv():
         'verbosity': 2,
         'verbosity': 2,
     }
     }
 
 
+# set UNFCCC_GHG_ROOT_PATH environment variable
+def task_set_env():
+    """
+    Set the environment variable for the module so data is stored in the correct folders
+    """
+    def set_root_path():
+        os.environ["UNFCCC_GHG_ROOT_PATH"] = "."
+
+    return {
+        'actions': [set_root_path],
+    }
+
 
 
 # Task to create the mapping files which map folder names to ISO 3-letter country codes
 # Task to create the mapping files which map folder names to ISO 3-letter country codes
 read_config_folder = {
 read_config_folder = {
@@ -29,8 +42,9 @@ def task_map_folders():
     Create or update the folder mapping in the given folder
     Create or update the folder mapping in the given folder
     """
     """
     return {
     return {
-        'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        'actions': [f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder={read_config_folder['folder']}"],
                     f"--folder={read_config_folder['folder']}"],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -44,6 +58,7 @@ def task_update_bur():
         'actions': ['datalad run -m "Fetch BUR submissions" '
         'actions': ['datalad run -m "Fetch BUR submissions" '
                     '-o downloaded_data/UNFCCC/submissions-bur.csv '
                     '-o downloaded_data/UNFCCC/submissions-bur.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py'],
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py'],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -58,9 +73,10 @@ def task_download_bur():
         'actions': ['datalad run -m "Download BUR submissions" '
         'actions': ['datalad run -m "Download BUR submissions" '
                     '-i downloaded_data/UNFCCC/submissions-bur.csv '
                     '-i downloaded_data/UNFCCC/submissions-bur.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=BUR',
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=BUR',
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     f"--folder=downloaded_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -73,6 +89,7 @@ def task_update_nc():
         'actions': ['datalad run -m "Fetch NC submissions" '
         'actions': ['datalad run -m "Fetch NC submissions" '
                     '-o downloaded_data/UNFCCC/submissions-nc.csv '
                     '-o downloaded_data/UNFCCC/submissions-nc.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py'],
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py'],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -87,9 +104,10 @@ def task_download_nc():
         'actions': ['datalad run -m "Download NC submissions" '
         'actions': ['datalad run -m "Download NC submissions" '
                     '-i downloaded_data/UNFCCC/submissions-nc.csv '
                     '-i downloaded_data/UNFCCC/submissions-nc.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=NC',
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=NC',
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     f"--folder=downloaded_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -112,6 +130,7 @@ def task_update_annexi():
                     f"-o downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"-o downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py "
                     f"--year={update_aI_config['year']}"],
                     f"--year={update_aI_config['year']}"],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -128,9 +147,10 @@ def task_download_annexi():
                     f"-i downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"-i downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py "
                     f"--category={update_aI_config['category']} --year={update_aI_config['year']}",
                     f"--category={update_aI_config['category']} --year={update_aI_config['year']}",
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     f"--folder=downloaded_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -141,9 +161,10 @@ def task_download_ndc():
     return {
     return {
         'actions': ['datalad run -m "Download NDC submissions" '
         'actions': ['datalad run -m "Download NDC submissions" '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py',
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py',
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     f"--folder=downloaded_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -163,9 +184,10 @@ def task_read_unfccc_submission():
     return {
     return {
         'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py "
         'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py "
                     f"--country={read_config['country']} --submission={read_config['submission']}",
                     f"--country={read_config['country']} --submission={read_config['submission']}",
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=extracted_data/UNFCCC"
                     f"--folder=extracted_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -188,13 +210,14 @@ def task_read_unfccc_crf_submission():
         f"--country={read_config_crf['country']} "
         f"--country={read_config_crf['country']} "
         f"--submission_year={read_config_crf['submission_year']} "
         f"--submission_year={read_config_crf['submission_year']} "
         f"--submission_date={read_config_crf['submission_date']} ",
         f"--submission_date={read_config_crf['submission_date']} ",
-        f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
         f"--folder=extracted_data/UNFCCC"
         f"--folder=extracted_data/UNFCCC"
         ]
         ]
     if read_config_crf["re_read"] == "True":
     if read_config_crf["re_read"] == "True":
         actions[0] = actions[0] + " --re_read"
         actions[0] = actions[0] + " --re_read"
     return {
     return {
         'actions': actions,
         'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -205,7 +228,7 @@ def task_read_new_unfccc_crf_for_year():
     data not present yet. Only reads the latest updated submission for each country."""
     data not present yet. Only reads the latest updated submission for each country."""
     actions = [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_CRF_reader/read_new_UNFCCC_CRF_for_year_datalad.py "
     actions = [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_CRF_reader/read_new_UNFCCC_CRF_for_year_datalad.py "
                f"--submission_year={read_config_crf['submission_year']} ",
                f"--submission_year={read_config_crf['submission_year']} ",
-               f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+               f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                f"--folder=extracted_data/UNFCCC"
                f"--folder=extracted_data/UNFCCC"
                ]
                ]
     # specifying countries is currently disabled due to problems with command line
     # specifying countries is currently disabled due to problems with command line
@@ -217,6 +240,7 @@ def task_read_new_unfccc_crf_for_year():
     return {
     return {
         #'basename': "Read_CRF_year",
         #'basename': "Read_CRF_year",
         'actions': actions,
         'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -235,11 +259,12 @@ def task_read_unfccc_di_for_country():
         f"./venv/bin/python "
         f"./venv/bin/python "
         f"UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_datalad.py "
         f"UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_datalad.py "
         f"--country={read_config_di['country']}",
         f"--country={read_config_di['country']}",
-        f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
         f"--folder=extracted_data/UNFCCC"
         f"--folder=extracted_data/UNFCCC"
         ]
         ]
     return {
     return {
         'actions': actions,
         'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -250,11 +275,12 @@ def task_process_unfccc_di_for_country():
         f"./venv/bin/python "
         f"./venv/bin/python "
         f"UNFCCC_GHG_data/UNFCCC_DI_reader/process_UNFCCC_DI_for_country_datalad.py "
         f"UNFCCC_GHG_data/UNFCCC_DI_reader/process_UNFCCC_DI_for_country_datalad.py "
         f"--country={read_config_di['country']} --date={read_config_di['date']}",
         f"--country={read_config_di['country']} --date={read_config_di['date']}",
-        f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
         f"--folder=extracted_data/UNFCCC"
         f"--folder=extracted_data/UNFCCC"
         ]
         ]
     return {
     return {
         'actions': actions,
         'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -266,8 +292,9 @@ def task_country_info():
     """ Print information on submissions and datasets
     """ Print information on submissions and datasets
     available for given country"""
     available for given country"""
     return {
     return {
-        'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/country_info.py "
+        'actions': [f"./venv/bin/python UNFCCC_GHG_data/helper/country_info.py "
                     f"--country={read_config['country']}"],
                     f"--country={read_config['country']}"],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }