|
@@ -0,0 +1,1407 @@
|
|
|
+import primap2 as pm2
|
|
|
+import unfccc_di_api
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+import pycountry
|
|
|
+import itertools
|
|
|
+import json
|
|
|
+import copy
|
|
|
+import xarray as xr
|
|
|
+import datalad.api
|
|
|
+import re
|
|
|
+from datalad.support.exceptions import IncompleteResultsError
|
|
|
+from datetime import date
|
|
|
+from typing import Optional, Dict, List, Union
|
|
|
+from pathlib import Path
|
|
|
+from copy import deepcopy
|
|
|
+from dask.base import tokenize
|
|
|
+
|
|
|
+from UNFCCC_GHG_data.UNFCCC_CRF_reader.UNFCCC_CRF_reader_core import find_latest_date
|
|
|
+
|
|
|
+from .UNFCCC_DI_reader_config import di_to_pm2if_template_nai
|
|
|
+from .UNFCCC_DI_reader_config import di_to_pm2if_template_ai
|
|
|
+from .UNFCCC_DI_reader_config import di_query_filters
|
|
|
+from .UNFCCC_DI_reader_config import di_processing_info
|
|
|
+from .UNFCCC_DI_reader_config import cat_conversion
|
|
|
+from .UNFCCC_DI_reader_config import gas_baskets
|
|
|
+from .UNFCCC_DI_reader_config import cat_code_regexp
|
|
|
+from .util import NoDIDataError, nAI_countries, AI_countries
|
|
|
+from .util import DI_date_format, regex_date
|
|
|
+
|
|
|
+from UNFCCC_GHG_data.helper import custom_country_mapping
|
|
|
+from UNFCCC_GHG_data.helper import get_country_code, get_country_name
|
|
|
+from UNFCCC_GHG_data.helper import extracted_data_path_UNFCCC, root_path, code_path
|
|
|
+from UNFCCC_GHG_data.helper import dataset_path_UNFCCC
|
|
|
+from UNFCCC_GHG_data.helper import convert_categories
|
|
|
+
|
|
|
+
|
|
|
+def read_UNFCCC_DI_for_country(
|
|
|
+ country_code: str,
|
|
|
+ category_groups: Optional[Dict]=None,
|
|
|
+ read_subsectors: bool=False,
|
|
|
+ save_data: Optional[bool]=True,
|
|
|
+ date_str: Optional[str]=None,
|
|
|
+ pm2if_specifications: Optional[dict]=None,
|
|
|
+ default_gwp: Optional[str]=None,
|
|
|
+ debug: Optional[bool]=False,
|
|
|
+):
|
|
|
+ """
|
|
|
+    Reads data for a country from the UNFCCC DI interface and saves it in the
+    PRIMAP2 native and interchange formats.
|
|
|
+ """
|
|
|
+
|
|
|
+ # read the data
|
|
|
+ data_df = read_UNFCCC_DI_for_country_df(
|
|
|
+ country_code=country_code,
|
|
|
+ category_groups=category_groups,
|
|
|
+ read_subsectors=read_subsectors,
|
|
|
+ debug=debug,
|
|
|
+ )
|
|
|
+
|
|
|
+ # set date_str if not given
|
|
|
+ if date_str is None:
|
|
|
+ today = date.today()
|
|
|
+ date_str = today.strftime(DI_date_format)
|
|
|
+
|
|
|
+ # convert raw data to pm2 interchange format and save
|
|
|
+ data_if = convert_DI_data_to_pm2_if(
|
|
|
+ data=data_df,
|
|
|
+ pm2if_specifications=deepcopy(pm2if_specifications),
|
|
|
+ default_gwp=default_gwp,
|
|
|
+ date_str=date_str,
|
|
|
+ debug=debug,
|
|
|
+ )
|
|
|
+
|
|
|
+ # convert raw data to native pm2 format and save that
|
|
|
+ data_pm2 = convert_DI_IF_data_to_pm2(
|
|
|
+ data_di_if=data_if,
|
|
|
+ )
|
|
|
+
|
|
|
+ # save
|
|
|
+ if save_data:
|
|
|
+ save_DI_country_data(data_pm2, raw=True)
|
|
|
+
|
|
|
+ return data_pm2
|
|
|
+
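+# Illustrative usage sketch (kept as a comment so nothing runs on import; "KEN" is
+# just an example ISO3 code, not a special value):
+#
+#     data_kenya = read_UNFCCC_DI_for_country(
+#         country_code="KEN",
+#         save_data=False,  # set True to also write .nc/.csv/.yaml to the country folder
+#     )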
|
|
|
+
|
|
|
+def process_and_save_UNFCCC_DI_for_country(
|
|
|
+ country_code: str,
|
|
|
+ date_str: Union[str, None]=None,
|
|
|
+) -> xr.Dataset:
|
|
|
+ '''
|
|
|
+    Process data and save it to disk using default parameters.
|
|
|
+ '''
|
|
|
+
|
|
|
+ # get latest dataset if no date given
|
|
|
+ if date_str is None:
|
|
|
+ # get the latest date
|
|
|
+ raw_data_file = find_latest_DI_data(country_code, raw=True)
|
|
|
+ else:
|
|
|
+ raw_data_file = determine_filename(country_code, date_str, raw=True,
|
|
|
+ hash=False)
|
|
|
+
|
|
|
+ raw_data_file = raw_data_file.parent / (raw_data_file.name + '.nc')
|
|
|
+ print(f"process {raw_data_file.name}")
|
|
|
+ if not raw_data_file.exists():
|
|
|
+ raise ValueError(f"File {raw_data_file.name} does not exist. Check if it "
|
|
|
+ "has been read.")
|
|
|
+
|
|
|
+ # load the data
|
|
|
+ data_to_process = pm2.open_dataset(raw_data_file)
|
|
|
+
|
|
|
+ # get parameters
|
|
|
+ countries = list(data_to_process.coords[data_to_process.attrs['area']].values)
|
|
|
+ if len(countries) > 1:
|
|
|
+ raise ValueError(
|
|
|
+ f"Found {len(countries)} countries. Only single country data "
|
|
|
+ f"can be processed by this function. countries: {countries}")
|
|
|
+ else:
|
|
|
+ country_code = countries[0]
|
|
|
+ processing_info_country = di_processing_info[country_code]
|
|
|
+ entities_to_ignore = [] # TODO: check and make default list
|
|
|
+
|
|
|
+ # process
|
|
|
+ data_processed = process_UNFCCC_DI_for_country(
|
|
|
+ data_country=data_to_process,
|
|
|
+ entities_to_ignore=entities_to_ignore,
|
|
|
+ gas_baskets=gas_baskets,
|
|
|
+ cat_conversion=cat_conversion,
|
|
|
+ sectors=None,
|
|
|
+ processing_info_country=processing_info_country,
|
|
|
+ )
|
|
|
+
|
|
|
+ # save
|
|
|
+ save_DI_country_data(data_processed, raw=False)
|
|
|
+
|
|
|
+ return data_processed
|
|
|
+
|
|
|
+
|
|
|
+def process_UNFCCC_DI_for_country(
|
|
|
+ data_country: xr.Dataset,
|
|
|
+ entities_to_ignore: List[str],
|
|
|
+ gas_baskets: Dict[str, List[str]],
|
|
|
+ cat_conversion: Dict[str, Dict] = None,
|
|
|
+ sectors: List[str] = None,
|
|
|
+ processing_info_country: Dict = None,
|
|
|
+) -> xr.Dataset:
|
|
|
+ """
|
|
|
+ Process data from DI interface (where necessary).
|
|
|
+ * Downscaling including subtraction of time series
|
|
|
+ * country specific sector aggregation
|
|
|
+ * Conversion to IPCC2006 categories
|
|
|
+ * general sector and gas basket aggregation (in new categories)
|
|
|
+ """
|
|
|
+ #### 0: gather information
|
|
|
+ countries = list(data_country.coords[data_country.attrs['area']].values)
|
|
|
+ if len(countries) > 1:
|
|
|
+ raise ValueError(
|
|
|
+ f"Found {len(countries)} countries. Only single country data "
|
|
|
+ f"can be processed by this function. countries: {countries}")
|
|
|
+ else:
|
|
|
+ country_code = countries[0]
|
|
|
+
|
|
|
+ cat_col = data_country.attrs['cat']
|
|
|
+ temp = re.findall(r'\((.*)\)', cat_col)
|
|
|
+ cat_terminology_in = temp[0]
|
|
|
+
|
|
|
+ #### 1: general processing
|
|
|
+ # remove unused cats
|
|
|
+ data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
|
|
|
+ # remove unused years
|
|
|
+    data_country = data_country.dropna('time', how='all')
|
|
|
+ # remove variables only containing nan
|
|
|
+ nan_vars_country = [var for var in data_country.data_vars if
|
|
|
+ data_country[var].isnull().all().data == True]
|
|
|
+ data_country = data_country.drop_vars(nan_vars_country)
|
|
|
+
|
|
|
+ # remove unnecessary variables
|
|
|
+ entities_ignore_present = [entity for entity in entities_to_ignore if
|
|
|
+ entity in data_country.data_vars]
|
|
|
+ data_country = data_country.drop_vars(entities_ignore_present)
|
|
|
+
|
|
|
+ #### 2: country specific processing
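+    # Illustrative sketch of the structure this block expects in
+    # processing_info_country (the real definitions live in di_processing_info in
+    # the config module; the category codes below are made up):
+    #
+    #     {
+    #         "default": {
+    #             "tolerance": 0.01,
+    #             "remove_years": ["1994"],
+    #             "aggregate_cats": {"2": {"sources": ["2.A", "2.B"]}},
+    #             "downscale": {"sectors": {...}, "entities": {...}},
+    #         },
+    #         "<scenario>": {...},  # optional scenario-specific overrides
+    #     }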
|
|
|
+
|
|
|
+
|
|
|
+ if processing_info_country is not None:
|
|
|
+ # get scenario
|
|
|
+ scenarios = list(data_country.coords[data_country.attrs['scen']].values)
|
|
|
+ if len(scenarios) > 1:
|
|
|
+ raise ValueError(
|
|
|
+ f"Found {len(scenarios)} scenarios. Only single scenario data "
|
|
|
+ f"can be processed by this function. Scenarios: {scenarios}")
|
|
|
+ else:
|
|
|
+ scenario = scenarios[0]
|
|
|
+ if scenario in processing_info_country.keys():
|
|
|
+ processing_info_country_scen = processing_info_country[scenario]
|
|
|
+ else:
|
|
|
+ processing_info_country_scen = processing_info_country['default']
|
|
|
+
|
|
|
+
|
|
|
+ if 'tolerance' in processing_info_country_scen:
|
|
|
+ tolerance = processing_info_country_scen["tolerance"]
|
|
|
+ else:
|
|
|
+ tolerance = 0.01
|
|
|
+
|
|
|
+ # take only desired years
|
|
|
+ if 'years' in processing_info_country_scen:
|
|
|
+ data_country = data_country.pr.loc[
|
|
|
+ {'time': processing_info_country_scen['years']}]
|
|
|
+
|
|
|
+ # remove timeseries if desired
|
|
|
+ if 'remove_ts' in processing_info_country_scen:
|
|
|
+ for case in processing_info_country_scen['remove_ts']:
|
|
|
+ remove_info = processing_info_country_scen['remove_ts'][case]
|
|
|
+ entities = remove_info.pop("entities")
|
|
|
+ for entity in entities:
|
|
|
+ data_country[entity].pr.loc[remove_info] = \
|
|
|
+ data_country[entity].pr.loc[remove_info] * np.nan
|
|
|
+
|
|
|
+ # remove all data for given years if necessary
|
|
|
+ if 'remove_years' in processing_info_country_scen:
|
|
|
+ data_country.pr.loc[{'time': processing_info_country_scen[
|
|
|
+ 'remove_years']}] = \
|
|
|
+ data_country.pr.loc[{'time': processing_info_country_scen[
|
|
|
+ 'remove_years']}] * np.nan
|
|
|
+
|
|
|
+ # subtract categories
|
|
|
+ if 'subtract_cats' in processing_info_country_scen:
|
|
|
+ subtract_cats_current = processing_info_country_scen['subtract_cats']
|
|
|
+ if 'entities' in subtract_cats_current.keys():
|
|
|
+ entities_current = subtract_cats_current['entities']
|
|
|
+ else:
|
|
|
+ entities_current = list(data_country.data_vars)
|
|
|
+ print(f"Subtracting categories for country {country_code}, entities "
|
|
|
+ f"{entities_current}")
|
|
|
+ for cat_to_generate in subtract_cats_current:
|
|
|
+ cats_to_subtract = \
|
|
|
+ subtract_cats_current[cat_to_generate]['subtract']
|
|
|
+ data_sub = \
|
|
|
+ data_country.pr.loc[{'category': cats_to_subtract}].pr.sum(
|
|
|
+ dim='category', skipna=True, min_count=1)
|
|
|
+ data_parent = data_country.pr.loc[
|
|
|
+ {'category': subtract_cats_current[cat_to_generate]['parent']}]
|
|
|
+ data_agg = data_parent - data_sub
|
|
|
+ nan_vars = [var for var in data_agg.data_vars if
|
|
|
+ data_agg[var].isnull().all().data == True]
|
|
|
+ data_agg = data_agg.drop(nan_vars)
|
|
|
+ if len(data_agg.data_vars) > 0:
|
|
|
+ print(f"Generating {cat_to_generate} through subtraction")
|
|
|
+ data_agg = data_agg.expand_dims([f'category ('
|
|
|
+ f'{cat_terminology_in})'])
|
|
|
+ data_agg = data_agg.assign_coords(
|
|
|
+ coords={f'category ({cat_terminology_in})':
|
|
|
+ (f'category ({cat_terminology_in})',
|
|
|
+ [cat_to_generate])})
|
|
|
+ data_country = data_country.pr.merge(data_agg,
|
|
|
+ tolerance=tolerance)
|
|
|
+ else:
|
|
|
+ print(f"no data to generate category {cat_to_generate}")
|
|
|
+
|
|
|
+ # downscaling
|
|
|
+ if 'downscale' in processing_info_country_scen:
|
|
|
+ if 'sectors' in processing_info_country_scen['downscale']:
|
|
|
+ sector_downscaling = \
|
|
|
+ processing_info_country_scen['downscale']['sectors']
|
|
|
+ for case in sector_downscaling.keys():
|
|
|
+ print(f"Downscaling for {case}.")
|
|
|
+ sector_downscaling_current = sector_downscaling[case]
|
|
|
+ entities = sector_downscaling_current.pop('entities')
|
|
|
+ for entity in entities:
|
|
|
+ data_country[entity] = data_country[
|
|
|
+ entity].pr.downscale_timeseries(
|
|
|
+ **sector_downscaling_current)
|
|
|
+ # , skipna_evaluation_dims=None)
|
|
|
+
|
|
|
+ if 'entities' in processing_info_country_scen['downscale']:
|
|
|
+ entity_downscaling = \
|
|
|
+ processing_info_country_scen['downscale']['entities']
|
|
|
+ for case in entity_downscaling.keys():
|
|
|
+ #print(case)
|
|
|
+ print(data_country.coords[f'category ('
|
|
|
+ f'{cat_terminology_in})'].values)
|
|
|
+ data_country = data_country.pr.downscale_gas_timeseries(
|
|
|
+ **entity_downscaling[case], skipna=True,
|
|
|
+ skipna_evaluation_dims=None)
|
|
|
+
|
|
|
+ # aggregate categories
|
|
|
+ if 'aggregate_cats' in processing_info_country_scen:
|
|
|
+ aggregate_cats_current = processing_info_country_scen['aggregate_cats']
|
|
|
+ print(
|
|
|
+ f"Aggregating categories for country {country_code}")
|
|
|
+ for cat_to_agg in aggregate_cats_current:
|
|
|
+ print(f"Category: {cat_to_agg}")
|
|
|
+ source_cats = aggregate_cats_current[cat_to_agg]['sources']
|
|
|
+ data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
|
|
|
+ dim='category', skipna=True, min_count=1)
|
|
|
+ nan_vars = [var for var in data_agg.data_vars if
|
|
|
+ data_agg[var].isnull().all().data == True]
|
|
|
+ data_agg = data_agg.drop(nan_vars)
|
|
|
+ if len(data_agg.data_vars) > 0:
|
|
|
+ data_agg = data_agg.expand_dims([f'category ('
|
|
|
+ f'{cat_terminology_in})'])
|
|
|
+ data_agg = data_agg.assign_coords(
|
|
|
+ coords={f'category ({cat_terminology_in})':
|
|
|
+ (f'category ({cat_terminology_in})',
|
|
|
+ [cat_to_agg])})
|
|
|
+ data_country = data_country.pr.merge(data_agg,
|
|
|
+ tolerance=tolerance)
|
|
|
+ else:
|
|
|
+ print(f"no data to aggregate category {cat_to_agg}")
|
|
|
+
|
|
|
+ # aggregate gases if desired
|
|
|
+ if 'aggregate_gases' in processing_info_country_scen:
|
|
|
+ for case in processing_info_country_scen['aggregate_gases'].keys():
|
|
|
+ case_info = processing_info_country_scen['aggregate_gases'][case]
|
|
|
+ data_country[case_info['basket']] = \
|
|
|
+ data_country.pr.fill_na_gas_basket_from_contents(
|
|
|
+ **case_info)
|
|
|
+
|
|
|
+ #### 3: map categories
|
|
|
+ if country_code in nAI_countries:
|
|
|
+ # conversion from BURDI to IPCC2006_PRIMAP needed
|
|
|
+ cat_terminology_out = 'IPCC2006_PRIMAP'
|
|
|
+ data_country = convert_categories(
|
|
|
+ data_country,
|
|
|
+ cat_conversion[f"{cat_terminology_in}_to_{cat_terminology_out}"],
|
|
|
+ cat_terminology_out,
|
|
|
+ debug=False,
|
|
|
+ tolerance=0.01,
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ cat_terminology_out = cat_terminology_in
|
|
|
+
|
|
|
+ # more general processing
|
|
|
+ # reduce categories to output cats
|
|
|
+ if sectors is not None:
|
|
|
+ cats_to_keep = [cat for cat in
|
|
|
+ data_country.coords[f'category ({cat_terminology_out})'].values if
|
|
|
+ cat in sectors]
|
|
|
+ data_country = data_country.pr.loc[{'category': cats_to_keep}]
|
|
|
+
|
|
|
+ # create gas baskets
|
|
|
+ entities_present = set(data_country.data_vars)
|
|
|
+ for basket in gas_baskets.keys():
|
|
|
+ basket_contents_present = [gas for gas in gas_baskets[basket] if
|
|
|
+ gas in entities_present]
|
|
|
+ if len(basket_contents_present) > 0:
|
|
|
+ if basket in list(data_country.data_vars):
|
|
|
+ data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
|
|
|
+ basket=basket, basket_contents=basket_contents_present, min_count=1)
|
|
|
+ else:
|
|
|
+ try:
|
|
|
+ data_country[basket] = xr.full_like(data_country["CO2"],
|
|
|
+ np.nan).pr.quantify(
|
|
|
+ units="Gg CO2 / year")
|
|
|
+ data_country[basket].attrs = {"entity": basket.split(' ')[0],
|
|
|
+ "gwp_context": basket.split(' ')[1][
|
|
|
+ 1:-1]}
|
|
|
+ data_country[basket] = data_country.pr.gas_basket_contents_sum(
|
|
|
+ basket=basket, basket_contents=basket_contents_present,
|
|
|
+ min_count=1)
|
|
|
+            except Exception as ex:
+                print(f"No gas basket created for {country_code}: {ex}")
|
|
|
+
|
|
|
+ # amend title and comment
|
|
|
+ data_country.attrs["comment"] = data_country.attrs["comment"] + f" Processed on " \
|
|
|
+ f"{date.today()}"
|
|
|
+ data_country.attrs["title"] = data_country.attrs["title"] + f" Processed on " \
|
|
|
+ f"{date.today()}"
|
|
|
+
|
|
|
+ return data_country
|
|
|
+
|
|
|
+
|
|
|
+def read_UNFCCC_DI_for_country_df(
|
|
|
+ country_code: str,
|
|
|
+ category_groups: Optional[Dict]=None,
|
|
|
+ read_subsectors: bool=False,
|
|
|
+ debug: Optional[bool]=False,
|
|
|
+)->pd.DataFrame:
|
|
|
+ """
|
|
|
+    Read UNFCCC DI data for a given country. All data is read, including all
+    categories, gases, measures, and classifications.
+    Filtering is done later, on conversion to the PRIMAP2 format.
|
|
|
+
|
|
|
+ Parameters
|
|
|
+ ----------
|
|
|
+ country_code: str
|
|
|
+ ISO3 code of the country (country names don't work, use the wrapper function)
|
|
|
+
|
|
|
+ category_groups: dict (optional)
|
|
|
+ define which categories to read including filters on classification, measure,
|
|
|
+ gases
|
|
|
+
|
|
|
+ cat_groups = {
|
|
|
+ "4.A Enteric Fermentation": { #4.A Enteric Fermentation[14577]
|
|
|
+ "measure": [
|
|
|
+ 'Net emissions/removals',
|
|
|
+ 'Total population',
|
|
|
+ ],
|
|
|
+ "gases": ["CH4"],
|
|
|
+ },
|
|
|
+ }
|
|
|
+
|
|
|
+ Returns
|
|
|
+ -------
|
|
|
+ pd.DataFrame with read data
|
|
|
+
|
|
|
+ """
|
|
|
+ reader = unfccc_di_api.UNFCCCApiReader()
|
|
|
+
|
|
|
+ # template for the query to the DI API
|
|
|
+ query_template = {
|
|
|
+ "party_codes": [country_code],
|
|
|
+ "normalize_gas_names": True
|
|
|
+ }
|
|
|
+
|
|
|
+ # find country group
|
|
|
+ if country_code in nAI_countries:
|
|
|
+ ai_country = False
|
|
|
+ elif country_code in AI_countries:
|
|
|
+ ai_country = True
|
|
|
+ #di_data = reader.annex_one_reader.query(**query)
|
|
|
+ else:
|
|
|
+ raise ValueError(f"Country code {country_code} found neither in AnnexI nor "
|
|
|
+ f"non-AnnexI country lists.")
|
|
|
+
|
|
|
+ if category_groups is None:
|
|
|
+ # no category defs given, so use default which is all categories,
|
|
|
+ # all gases, but no other data
|
|
|
+ if debug:
|
|
|
+ print(f"Using default config to read for country {country_code}")
|
|
|
+ if ai_country:
|
|
|
+ all_gases = reader.annex_one_reader.gases["name"]
|
|
|
+ query = query_template
|
|
|
+ query["gases"] = list(set(all_gases) - {"No gas"})
|
|
|
+ if debug:
|
|
|
+ print(f"Using query: {query}")
|
|
|
+ di_data = reader.annex_one_reader.query(**query)
|
|
|
+ else:
|
|
|
+ all_gases = reader.non_annex_one_reader.gases["name"]
|
|
|
+ query = query_template
|
|
|
+ query["gases"] = list(set(all_gases) - {"No gas"})
|
|
|
+ if debug:
|
|
|
+ print(f"Using query: {query}")
|
|
|
+ di_data = reader.non_annex_one_reader.query(**query)
|
|
|
+ else:
|
|
|
+ # detailed query per category (could also be just the top level cat)
|
|
|
+
|
|
|
+ # read available categories and measures
|
|
|
+ if ai_country:
|
|
|
+ categories = reader.annex_one_reader.category_tree.all_nodes()
|
|
|
+ measures = reader.annex_one_reader.measure_tree.all_nodes()
|
|
|
+ else:
|
|
|
+ categories = reader.non_annex_one_reader.category_tree.all_nodes()
|
|
|
+ measures = reader.non_annex_one_reader.measure_tree.all_nodes()
|
|
|
+
|
|
|
+ # set data to none so we have the variable for the first category
|
|
|
+ di_data = None
|
|
|
+
|
|
|
+
|
|
|
+ for category in category_groups:
|
|
|
+ if debug:
|
|
|
+ print(f"Working on {category}")
|
|
|
+ this_cat_config = category_groups[category]
|
|
|
+ # category specific query
|
|
|
+ query = query_template.copy()
|
|
|
+ for filter in di_query_filters:
|
|
|
+ if filter in this_cat_config.keys():
|
|
|
+ query[filter] = this_cat_config[filter]
|
|
|
+
|
|
|
+ # get the category nodes with the given tag (might be multiple)
|
|
|
+ cat_nodes = [cat_node for cat_node in categories if cat_node.tag == category]
|
|
|
+ if debug:
|
|
|
+ print(f"Found fitting category nodes: {cat_nodes}")
|
|
|
+ node_codes = []
|
|
|
+ for node in cat_nodes:
|
|
|
+ if "read_subsectors" in this_cat_config.keys():
|
|
|
+ read_subsectors_this_cat = this_cat_config["read_subsectors"]
|
|
|
+ else:
|
|
|
+ read_subsectors_this_cat = read_subsectors
|
|
|
+ if read_subsectors_this_cat:
|
|
|
+ # get the subcategories
|
|
|
+ sub_nodes = reader.non_annex_one_reader.category_tree.subtree(
|
|
|
+ nid=node.identifier).all_nodes()
|
|
|
+ node_codes = node_codes + (
|
|
|
+ [sub_node.identifier for sub_node in sub_nodes])
|
|
|
+ else:
|
|
|
+ node_codes = node_codes + [node.identifier]
|
|
|
+ if debug:
|
|
|
+ print(f"Found node_codes: {node_codes}")
|
|
|
+ # add category node_codes to query
|
|
|
+ query["category_ids"] = node_codes
|
|
|
+
|
|
|
+ if "measure" in this_cat_config:
|
|
|
+ measure_nodes = [
|
|
|
+ measure_node for measure_node in measures if
|
|
|
+ measure_node.tag in this_cat_config["measure"]]
|
|
|
+ if debug:
|
|
|
+ print(f"Found measure_nodes: {measure_nodes}")
|
|
|
+ # add measure nodes to query
|
|
|
+ query["measure_ids"] = [node.identifier for node in measure_nodes]
|
|
|
+ if debug:
|
|
|
+ print(query)
|
|
|
+
|
|
|
+ # read the data. If no data is available for the query the error is caught and a message is printed
|
|
|
+ try:
|
|
|
+ if ai_country:
|
|
|
+ data_new = reader.annex_one_reader.query(**query)
|
|
|
+ else:
|
|
|
+ data_new = reader.non_annex_one_reader.query(**query)
|
|
|
+
|
|
|
+ n_points = len(data_new)
|
|
|
+ n_countries = len(data_new["party"].unique())
|
|
|
+ if debug:
|
|
|
+ print(f"Collected {n_points} data points for {n_countries} countries")
|
|
|
+ if di_data is None:
|
|
|
+ di_data = data_new
|
|
|
+ else:
|
|
|
+ di_data = pd.concat([di_data, data_new])
|
|
|
+ except unfccc_di_api.NoDataError:
|
|
|
+ print(f"No data for {category}")
|
|
|
+
|
|
|
+    # if data has been collected print some information, otherwise raise an error
|
|
|
+ if di_data is None:
|
|
|
+ raise ValueError(f"No data collected for country {country_code} and category "
|
|
|
+ f"groups "
|
|
|
+ f"{category_groups}")
|
|
|
+ elif debug:
|
|
|
+ # print some information on collected data
|
|
|
+ print(f"Collected data for country {country_code}")
|
|
|
+ print("### Categories ###")
|
|
|
+ categories = di_data["category"].unique()
|
|
|
+ categories.sort()
|
|
|
+ print(categories)
|
|
|
+ print("### Classifications ###")
|
|
|
+ classifications = di_data["classification"].unique()
|
|
|
+ classifications.sort()
|
|
|
+ print(classifications)
|
|
|
+ print("### Measures ###")
|
|
|
+ measures = di_data["measure"].unique()
|
|
|
+ measures.sort()
|
|
|
+ print(measures)
|
|
|
+
|
|
|
+ return di_data
|
|
|
+
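+# Illustrative call of the raw reader with a category filter (comment only; the
+# category label and measure name follow the example in the docstring above):
+#
+#     df = read_UNFCCC_DI_for_country_df(
+#         country_code="KEN",
+#         category_groups={
+#             "4.A Enteric Fermentation": {
+#                 "measure": ["Net emissions/removals"],
+#                 "gases": ["CH4"],
+#             },
+#         },
+#     )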
|
|
|
+
|
|
|
+def convert_DI_data_to_pm2_if(
|
|
|
+ data: pd.DataFrame,
|
|
|
+ pm2if_specifications: Optional[dict]=None,
|
|
|
+ default_gwp: Optional[str]=None,
|
|
|
+ date_str: Optional[str]=None,
|
|
|
+ debug: bool = False,
|
|
|
+) -> pd.DataFrame:
|
|
|
+ """
|
|
|
+ Convert data returned from the unfccc_di_api package to primap2 interchange format
|
|
|
+
|
|
|
+    TODO: consider moving the specification template into this function and using
+        the config parameter only to overwrite certain parameters (makes sense if
+        the function is used in a broader context).
|
|
|
+ """
|
|
|
+
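+    # Rough shape of the pm2if_specifications templates used below (the actual
+    # templates are di_to_pm2if_template_ai / _nai from the config module; only the
+    # keys touched in this function are shown):
+    #
+    #     {
+    #         "coords_defaults": {"scenario": ...},
+    #         "meta_data": {"title": ..., "comment": ...},
+    #         "filter_remove": {...},
+    #         ...  # further arguments for pm2.pm2io.convert_long_dataframe_if
+    #     }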
|
|
|
+ print("Convert data to PRIMAP2 interchange format")
|
|
|
+
|
|
|
+    # create a copy of the data to avoid altering the original data
|
|
|
+ # this will be done inside the *convert_to_long_dataframe* function
|
|
|
+ # in the future. Thus it can be removed here once the category column
|
|
|
+ # copy workaround is no longer necessary
|
|
|
+ data_temp = data.copy(deep=True)
|
|
|
+
|
|
|
+ # check which country group we have
|
|
|
+ reader = unfccc_di_api.UNFCCCApiReader()
|
|
|
+ parties_present_ai = [party for party in data_temp["party"].unique() if party
|
|
|
+ in AI_countries]
|
|
|
+ parties_present_nai = [party for party in data_temp["party"].unique() if party
|
|
|
+ in nAI_countries]
|
|
|
+ if len(parties_present_ai) > 0:
|
|
|
+ if len(parties_present_nai) > 0:
|
|
|
+ raise ValueError("AnnexI and non-AnnexI parties present in one dataset. "
|
|
|
+ "This is not possible due to different DI category "
|
|
|
+ "terminologies. Convert to common categories.")
|
|
|
+ else:
|
|
|
+ ai_dataset = True
|
|
|
+ else:
|
|
|
+ ai_dataset=False
|
|
|
+
|
|
|
+ if pm2if_specifications is None:
|
|
|
+ if ai_dataset:
|
|
|
+ pm2if_specifications = deepcopy(di_to_pm2if_template_ai)
|
|
|
+ else:
|
|
|
+ pm2if_specifications = deepcopy(di_to_pm2if_template_nai)
|
|
|
+
|
|
|
+ # modify specifications
|
|
|
+ #pm2if_specifications["filter_remove"].update(filter_activity_factors)
|
|
|
+
|
|
|
+    # set the scenario: "DIrolling" if date_str is "country", otherwise the given
+    # date (today's date if no date is given explicitly)
+    if date_str == "country":
+        pm2if_specifications["coords_defaults"]["scenario"] = "DIrolling"
|
|
|
+ elif date_str is None:
|
|
|
+ today = date.today()
|
|
|
+ date_str = today.strftime(DI_date_format)
|
|
|
+ pm2if_specifications["coords_defaults"]["scenario"] = f"DI{date_str}"
|
|
|
+
|
|
|
+ # set metadata
|
|
|
+ countries = data["party"].unique()
|
|
|
+ if len(countries) > 1:
|
|
|
+ pm2if_specifications["meta_data"]["title"] = \
|
|
|
+ f"Data submitted to the UNFCCC by countries {countries} as " \
|
|
|
+ f"available in the DI interface on {date_str}."
|
|
|
+ else:
|
|
|
+ try:
|
|
|
+ country_info = pycountry.countries.get(alpha_3=countries[0])
|
|
|
+ country_name = country_info.name
|
|
|
+        except Exception:
|
|
|
+ country_name = countries[0]
|
|
|
+
|
|
|
+ pm2if_specifications["meta_data"]["title"] = \
|
|
|
+ f"Data submitted to the UNFCCC by country {country_name} as " \
|
|
|
+ f"available in the DI interface on {date_str}."
|
|
|
+
|
|
|
+ pm2if_specifications["meta_data"]["comment"] = \
|
|
|
+ pm2if_specifications["meta_data"]["comment"] + f" Data read on {date_str}."
|
|
|
+
|
|
|
+ # remove baseyear
|
|
|
+ idx_base_year = data_temp["year"] == "Base year"
|
|
|
+ data_temp = data_temp.drop(data_temp.index[idx_base_year])
|
|
|
+
|
|
|
+ # add GWP to entities where necessary
|
|
|
+ data_temp["unit"] = data_temp["unit"].replace(to_replace=r"(.*) CO2 equivalent",
|
|
|
+ value=r"\1CO2eq", regex=True)
|
|
|
+ row_idx_co2eq = data_temp["unit"].str.endswith("CO2eq")
|
|
|
+ if default_gwp is not None:
|
|
|
+ # convert all with GWPs given in input
|
|
|
+ data_temp.loc[row_idx_co2eq, "gas"] = data_temp.loc[row_idx_co2eq, "gas"] + \
|
|
|
+ " (SARGWP100)"
|
|
|
+ elif ai_dataset:
|
|
|
+ # convert with AR4
|
|
|
+ data_temp.loc[row_idx_co2eq, "gas"] = data_temp.loc[row_idx_co2eq, "gas"] + \
|
|
|
+ " (AR4GWP100)"
|
|
|
+ else:
|
|
|
+ # convert with SAR
|
|
|
+ data_temp.loc[row_idx_co2eq, "gas"] = data_temp.loc[row_idx_co2eq, "gas"] + \
|
|
|
+ " (SARGWP100)"
|
|
|
+
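+    # Example of the effect of the two steps above (the unit and gas strings are
+    # illustrative DI values): "kt CO2 equivalent" becomes "ktCO2eq", and for a
+    # non-AnnexI dataset the gas "HFCs" becomes "HFCs (SARGWP100)".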
|
|
|
+ # combine numeric and string values
|
|
|
+ nan_idx = data_temp["numberValue"].isna()
|
|
|
+ data_temp.loc[nan_idx, "numberValue"] = data_temp.loc[nan_idx, "stringValue"]
|
|
|
+ data_temp = data_temp.drop(columns=["stringValue"])
|
|
|
+
|
|
|
+    # Currently in primap2 a column can only be used once during data reading.
|
|
|
+ # We want to use the category column both for the primap2 "category"
|
|
|
+ # column (which contains the code only) and an additional column which stores
|
|
|
+ # the full name as available from the DI API. As a workaround we create a
|
|
|
+ # copy of the category column
|
|
|
+ if not ai_dataset:
|
|
|
+ data_temp["category_copy"] = data_temp["category"]
|
|
|
+
|
|
|
+ # replace category name and code by just the code
|
|
|
+ repl = lambda m: m.group('code')
|
|
|
+ data_temp["category"] = data_temp["category"].str.replace(cat_code_regexp, repl,
|
|
|
+ regex=True)
|
|
|
+
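+    # Example: assuming cat_code_regexp captures the leading code in a named group
+    # 'code', a DI label like "1.A.1  Energy Industries" is reduced to "1.A.1"
+    # (the exact pattern lives in the config module).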
|
|
|
+ # convert to pm2 interchange format
|
|
|
+ data_pm2if = pm2.pm2io.convert_long_dataframe_if(
|
|
|
+ data_temp,
|
|
|
+ **pm2if_specifications,
|
|
|
+ )
|
|
|
+
|
|
|
+ return data_pm2if
|
|
|
+
|
|
|
+
|
|
|
+def convert_DI_IF_data_to_pm2(
|
|
|
+ data_di_if: pd.DataFrame,
|
|
|
+)-> xr.Dataset:
|
|
|
+ if_index_cols = set(itertools.chain(*data_di_if.attrs["dimensions"].values()))
|
|
|
+ time_cols = set(data_di_if.columns.values) - if_index_cols
|
|
|
+ data_di_if.dropna(subset=time_cols, inplace=True, how="all")
|
|
|
+
|
|
|
+ try:
|
|
|
+ # use a copy as from_interchange_format modifies the input DF
|
|
|
+ data_pm2 = pm2.pm2io.from_interchange_format(
|
|
|
+ data_di_if.copy(deep=True), attrs=copy.deepcopy(data_di_if.attrs))
|
|
|
+    except Exception as ex:  # TODO: catch a more specific error once primap2 provides one
+        print(f'Error on conversion to PRIMAP2 native format: {ex}')
+        raise
|
|
|
+
|
|
|
+ return data_pm2
|
|
|
+
|
|
|
+
|
|
|
+def save_DI_country_data(
|
|
|
+ data_pm2: xr.Dataset,
|
|
|
+ raw: bool=True,
|
|
|
+):
|
|
|
+ '''
|
|
|
+ save primap2 and IF data to country folder
|
|
|
+ can be used for raw and processed data but for a single country only
|
|
|
+ '''
|
|
|
+
|
|
|
+ # preparations
|
|
|
+ data_if = data_pm2.pr.to_interchange_format()
|
|
|
+
|
|
|
+ ## get country
|
|
|
+ countries = data_if[data_pm2.attrs['area']].unique()
|
|
|
+ if len(countries) > 1:
|
|
|
+ raise ValueError(f"More than one country in input data. This function can only"
|
|
|
+ f"handle single country data. Countries: {countries}")
|
|
|
+ else:
|
|
|
+ country_code = countries[0]
|
|
|
+
|
|
|
+ ## get timestamp
|
|
|
+ scenario_col = data_pm2.attrs['scen']
|
|
|
+ scenarios = data_if[scenario_col].unique()
|
|
|
+ if len(scenarios) > 1:
|
|
|
+ raise ValueError(f"More than one scenario in input data. This function can only"
|
|
|
+ f"handle single scenario data. Scenarios: {scenarios}")
|
|
|
+ else:
|
|
|
+ scenario = scenarios[0]
|
|
|
+
|
|
|
+ date_str = scenario[2:]
|
|
|
+
|
|
|
+ # calculate the hash of the data to see if it's identical to present data
|
|
|
+ data_for_token = data_if.drop(columns=[scenario_col])
|
|
|
+ token = tokenize(data_for_token)
|
|
|
+
|
|
|
+ # get the filename with the hash and check if it exists (separate for pm2 format
|
|
|
+ # and IF to fix broken datasets if necessary)
|
|
|
+ filename_hash = root_path / determine_filename(country_code, token, raw, hash=True)
|
|
|
+
|
|
|
+ # primap2 native format
|
|
|
+ filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
|
|
|
+ if not filename_hash_nc.exists():
|
|
|
+ # if parent dir does not exist create it
|
|
|
+ if not filename_hash.parent.exists():
|
|
|
+ filename_hash.parent.mkdir()
|
|
|
+ # save the data
|
|
|
+ print(f"Data has changed. Save to {filename_hash_nc.name}")
|
|
|
+ compression = dict(zlib=True, complevel=9)
|
|
|
+ encoding = {var: compression for var in data_pm2.data_vars}
|
|
|
+ data_pm2.pr.to_netcdf(filename_hash_nc, encoding=encoding)
|
|
|
+
|
|
|
+ # primap2 IF
|
|
|
+ filename_hash_csv = filename_hash.parent / (filename_hash.name + '.csv')
|
|
|
+ if not filename_hash_csv.exists():
|
|
|
+ # save the data
|
|
|
+ print(f"Data has changed. Save to {filename_hash.name + '.csv/.yaml'}")
|
|
|
+ pm2.pm2io.write_interchange_format(filename_hash, data_if)
|
|
|
+ else:
|
|
|
+ print(f"Data unchanged for {country_code}. Create symlinks.")
|
|
|
+
|
|
|
+ # get the filename with the date
|
|
|
+ filename_date = root_path / determine_filename(country_code, date_str, raw)
|
|
|
+
|
|
|
+ # create the symlinks to the actual data (with the hash)
|
|
|
+ suffixes = ['.nc', '.csv', '.yaml']
|
|
|
+ for suffix in suffixes:
|
|
|
+ file_date = filename_date.parent / (filename_date.name + suffix)
|
|
|
+ file_hash = filename_hash.name + suffix
|
|
|
+ if file_date.exists():
|
|
|
+ file_date.unlink()
|
|
|
+ file_date.symlink_to(file_hash)
|
|
|
+
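+# Resulting layout in the country folder (sketch; <token> is the dask hash, <date>
+# the date from the scenario): the hash-named files hold the data, the date-named
+# files are symlinks to them, e.g.
+#
+#     KEN_DI_<token>_raw_hash.nc   <- actual data
+#     KEN_DI_<date>_raw.nc         -> KEN_DI_<token>_raw_hash.nc
+#
+# The same scheme applies to the .csv / .yaml interchange-format files.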
|
|
|
+
|
|
|
+def save_DI_dataset(
|
|
|
+ data_pm2: xr.Dataset,
|
|
|
+ raw: bool=True,
|
|
|
+ annexI: bool=False,
|
|
|
+):
|
|
|
+ '''
|
|
|
+ save primap2 and IF data to dataset folder
|
|
|
+ can be used for raw and processed data but not to save to country folders
|
|
|
+ '''
|
|
|
+
|
|
|
+ # preparations
|
|
|
+ data_if = data_pm2.pr.to_interchange_format()
|
|
|
+ if annexI:
|
|
|
+ country_group = "AnnexI"
|
|
|
+ else:
|
|
|
+ country_group = "non-AnnexI"
|
|
|
+
|
|
|
+
|
|
|
+ ## get timestamp
|
|
|
+ scenario_col = data_pm2.attrs['scen']
|
|
|
+ scenarios = data_if[scenario_col].unique()
|
|
|
+ if len(scenarios) > 1:
|
|
|
+ raise ValueError(f"More than one scenario in input data. This function can only"
|
|
|
+ f"handle single scenario data. Scenarios: {scenarios}")
|
|
|
+ else:
|
|
|
+ scenario = scenarios[0]
|
|
|
+
|
|
|
+ date_str = scenario[2:]
|
|
|
+
|
|
|
+ # calculate the hash of the data to see if it's identical to present data
|
|
|
+ data_for_token = data_if.drop(columns=[scenario_col])
|
|
|
+ token = tokenize(data_for_token)
|
|
|
+
|
|
|
+ # get the filename with the hash and check if it exists (separate for pm2 format
|
|
|
+ # and IF to fix broken datasets if necessary)
|
|
|
+ filename_hash = root_path / determine_dataset_filename(token, raw, annexI=annexI,
|
|
|
+ hash=True)
|
|
|
+ # primap2 native format
|
|
|
+ filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
|
|
|
+ if not filename_hash_nc.exists():
|
|
|
+ # if parent dir does not exist create it
|
|
|
+        # TODO: duplicated logic, also in determine_dataset_filename and for country data
|
|
|
+ if not filename_hash.parent.exists():
|
|
|
+ filename_hash.parent.mkdir()
|
|
|
+ # save the data
|
|
|
+ print(f"Data has changed. Save to {filename_hash_nc.name}")
|
|
|
+ compression = dict(zlib=True, complevel=9)
|
|
|
+ encoding = {var: compression for var in data_pm2.data_vars}
|
|
|
+ data_pm2.pr.to_netcdf(filename_hash_nc, encoding=encoding)
|
|
|
+
|
|
|
+ # primap2 IF
|
|
|
+ filename_hash_csv = filename_hash.parent / (filename_hash.name + '.csv')
|
|
|
+ if not filename_hash_csv.exists():
|
|
|
+ # save the data
|
|
|
+ print(f"Data has changed. Save to {filename_hash.name + '.csv/.yaml'}")
|
|
|
+ pm2.pm2io.write_interchange_format(filename_hash, data_if)
|
|
|
+ else:
|
|
|
+ print(f"Data unchanged for {country_group}. Create symlinks.")
|
|
|
+
|
|
|
+ # get the filename with the date
|
|
|
+ filename_date = root_path / determine_dataset_filename(date_str, raw=raw,
|
|
|
+ annexI=annexI, hash=False)
|
|
|
+
|
|
|
+ # create the symlinks to the actual data (with the hash)
|
|
|
+ suffixes = ['.nc', '.csv', '.yaml']
|
|
|
+ for suffix in suffixes:
|
|
|
+ file_date = filename_date.parent / (filename_date.name + suffix)
|
|
|
+ file_hash = filename_hash.name + suffix
|
|
|
+ if file_date.exists():
|
|
|
+ file_date.unlink()
|
|
|
+ file_date.symlink_to(file_hash)
|
|
|
+
|
|
|
+
|
|
|
+## functions for multiple country reading
|
|
|
+def read_UNFCCC_DI_for_country_group(
|
|
|
+ annexI: bool=False,
|
|
|
+) -> xr.Dataset:
|
|
|
+ '''
|
|
|
+    This function reads DI data for all countries in a group (AnnexI or non-AnnexI).
+    The function reads all data in one go using datalad run. As the output data file
+    names are unknown beforehand, datalad run uses explicit=False.
|
|
|
+ '''
|
|
|
+
|
|
|
+ today = date.today()
|
|
|
+ date_str = today.strftime(DI_date_format)
|
|
|
+
|
|
|
+ if annexI:
|
|
|
+ countries = AI_countries
|
|
|
+ data_all_if = None
|
|
|
+ country_group = "AnnexI"
|
|
|
+ else:
|
|
|
+ countries = nAI_countries
|
|
|
+ data_all = None
|
|
|
+ country_group = "non-AnnexI"
|
|
|
+
|
|
|
+ # read the data
|
|
|
+ for country in countries:
|
|
|
+ print(f"reading DI data for country {country}")
|
|
|
+
|
|
|
+ try:
|
|
|
+ data_country = read_UNFCCC_DI_for_country(
|
|
|
+ country_code=country,
|
|
|
+ category_groups=None, # read all categories
|
|
|
+ read_subsectors=False, # not applicable as we read all categories
|
|
|
+ date_str=date_str,
|
|
|
+ pm2if_specifications=None,
|
|
|
+ # automatically use the right specs for AI and NAI
|
|
|
+ default_gwp=None, # automatically uses right default GWP for AI and NAI
|
|
|
+ debug=False)
|
|
|
+
|
|
|
+ if annexI:
|
|
|
+                # AnnexI data has additional dimensions and the xarray merge
+                # needs a lot of memory, so we concatenate in interchange format
+                # and convert to the pm2 native format only once at the end
|
|
|
+ if data_all_if is None:
|
|
|
+ data_all_if = data_country.pr.to_interchange_format()
|
|
|
+ attrs = data_all_if.attrs
|
|
|
+ else:
|
|
|
+ data_all_if = pd.concat([data_all_if,
|
|
|
+ data_country.pr.to_interchange_format()])
|
|
|
+ else:
|
|
|
+ if data_all is None:
|
|
|
+ data_all = data_country
|
|
|
+ else:
|
|
|
+ data_all = data_all.pr.merge(data_country)
|
|
|
+
|
|
|
+ except unfccc_di_api.NoDataError as err:
|
|
|
+ print(f"No data for {country}.")
|
|
|
+ print(err)
|
|
|
+
|
|
|
+ if annexI:
|
|
|
+ data_all = pm2.pm2io.from_interchange_format(data_all_if, attrs=attrs,
|
|
|
+ max_array_size=500000000000)
|
|
|
+
|
|
|
+ countries_present = list(data_all.coords[data_all.attrs['area']].values)
|
|
|
+ data_all.attrs["title"] = f"Data submitted by the following {country_group} " \
|
|
|
+ f"countries and available in the DI interface on " \
|
|
|
+ f"{date_str}: {', '.join(countries_present)}"
|
|
|
+
|
|
|
+ # save the data
|
|
|
+ save_DI_dataset(data_all, raw=True, annexI=annexI)
|
|
|
+
|
|
|
+ return data_all
|
|
|
+
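+# Illustrative call (comment only): read and save the raw dataset for all
+# non-AnnexI parties in one go.
+#
+#     data_nai = read_UNFCCC_DI_for_country_group(annexI=False)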
|
|
|
+
|
|
|
+def process_UNFCCC_DI_for_country_group(
|
|
|
+ annexI: bool=False,
|
|
|
+) -> xr.Dataset:
|
|
|
+ '''
|
|
|
+    This function processes DI data for all countries in a group (AnnexI or non-AnnexI).
+    TODO: currently only non-AnnexI is implemented
+    The function processes all data in one go using datalad run. As the output data file
+    names are unknown beforehand, datalad run uses explicit=False.
|
|
|
+
|
|
|
+ TODO: use the latest
|
|
|
+
|
|
|
+
|
|
|
+ '''
|
|
|
+
|
|
|
+ today = date.today()
|
|
|
+ date_str = today.strftime(DI_date_format)
|
|
|
+
|
|
|
+ if annexI:
|
|
|
+ raise ValueError("Bulk reading for AnnexI countries not implemented yet")
|
|
|
+ else:
|
|
|
+ countries = nAI_countries
|
|
|
+
|
|
|
+ # read the data
|
|
|
+ data_all = None
|
|
|
+ for country in countries[0:5]:
|
|
|
+ print(f"reading DI data for country {country}")
|
|
|
+
|
|
|
+ try:
|
|
|
+ data_country = read_UNFCCC_DI_for_country(
|
|
|
+ country_code=country,
|
|
|
+ category_groups=None, # read all categories
|
|
|
+ read_subsectors=False, # not applicable as we read all categories
|
|
|
+ date_str=date_str,
|
|
|
+ pm2if_specifications=None,
|
|
|
+ # automatically use the right specs for AI and NAI
|
|
|
+ default_gwp=None, # automatically uses right default GWP for AI and NAI
|
|
|
+ debug=False)
|
|
|
+
|
|
|
+ if data_all is None:
|
|
|
+ data_all = data_country
|
|
|
+ else:
|
|
|
+ data_all = data_all.pr.merge(data_country)
|
|
|
+ except unfccc_di_api.NoDataError as err:
|
|
|
+ print(f"No data for {country}.")
|
|
|
+ print(err)
|
|
|
+
|
|
|
+ # TODO: write metadata
|
|
|
+
|
|
|
+ # save the data
|
|
|
+ save_DI_dataset(data_all, raw=True, annexI=annexI)
|
|
|
+
|
|
|
+ return data_all
|
|
|
+
|
|
|
+# TODO: add interface functions and scripts to read all data
+# add process-all functions and scripts
+# merge into main
+# run reading procedure
+# config for all DI data
+# re-run CRF etc.
|
|
|
+
|
|
|
+
|
|
|
+## datalad and pydoit interface functions
|
|
|
+def read_DI_for_country_datalad(
|
|
|
+ country: str,
|
|
|
+) -> None:
|
|
|
+ """
|
|
|
+ Wrapper around read_UNFCCC_DI_for_country which takes care of selecting input
|
|
|
+ and output files and using datalad run to trigger the data reading
|
|
|
+
|
|
|
+ Parameters
|
|
|
+    ----------
|
|
|
+
|
|
|
+ country: str
|
|
|
+ country name or ISO 3-letter country code
|
|
|
+
|
|
|
+ """
|
|
|
+
|
|
|
+ # get date to determine output filename
|
|
|
+ today = date.today()
|
|
|
+ date_str = today.strftime(DI_date_format)
|
|
|
+
|
|
|
+ # get all the info for the country
|
|
|
+ country_info = get_input_and_output_files_for_country_DI(country, date_str,
|
|
|
+ raw=True, verbose=True)
|
|
|
+
|
|
|
+ print(f"Attempting to read DI data for {country_info['name']}.")
|
|
|
+ print("#"*80)
|
|
|
+ print("")
|
|
|
+ print(f"Using the UNFCCC_DI_reader")
|
|
|
+ print("")
|
|
|
+ print(f"Run the script using datalad run via the python api")
|
|
|
+ script = code_path / "UNFCCC_DI_reader" / "read_UNFCCC_DI_for_country.py"
|
|
|
+ script = script.relative_to(root_path)
|
|
|
+
|
|
|
+ cmd = f"./venv/bin/python3 {script.as_posix()} --country={country_info['code']} " \
|
|
|
+ f"--date={date_str}"
|
|
|
+ try:
|
|
|
+ datalad.api.run(
|
|
|
+ cmd=cmd,
|
|
|
+ dataset=root_path,
|
|
|
+ message=f"Read DI data for {country_info['name']}.",
|
|
|
+ inputs=country_info["input"],
|
|
|
+ outputs=country_info["output"],
|
|
|
+ dry_run=None,
|
|
|
+ explicit=False,
|
|
|
+ )
|
|
|
+ except IncompleteResultsError as IRE:
|
|
|
+ print(f"IncompleteResultsError occured when running {cmd}: {IRE}")
|
|
|
+ except Exception as ex:
|
|
|
+ print(f"Exception occurred when running {cmd}")
|
|
|
+        print(ex)
|
|
|
+
|
|
|
+
|
|
|
+def process_DI_for_country_datalad(
|
|
|
+ country: str,
|
|
|
+ date_str: Union[str, None],
|
|
|
+) -> None:
|
|
|
+ """
|
|
|
+ Wrapper around process_UNFCCC_DI_for_country which takes care of selecting input
|
|
|
+ and output files and using datalad run to trigger the data processing
|
|
|
+
|
|
|
+ Parameters
|
|
|
+    ----------
|
|
|
+
|
|
|
+ country: str
|
|
|
+ country name or ISO 3-letter country code
|
|
|
+ date_str: str
|
|
|
+ Date of the data to be processed in the format %Y-%m-%d (e.g. 2023-01-30). If
|
|
|
+ no date is given the last data read will be processed.
|
|
|
+ """
|
|
|
+
|
|
|
+ # get all the info for the country
|
|
|
+ country_info = get_input_and_output_files_for_country_DI(country, date_str,
|
|
|
+ raw=True, verbose=True)
|
|
|
+
|
|
|
+ print(f"Attempting to process DI data for {country_info['name']}.")
|
|
|
+ print("#"*80)
|
|
|
+ print("")
|
|
|
+ print(f"Using the UNFCCC_DI_reader")
|
|
|
+ print("")
|
|
|
+ print(f"Run the script using datalad run via the python api")
|
|
|
+ script = code_path / "UNFCCC_DI_reader" / "process_UNFCCC_DI_for_country.py"
|
|
|
+ script = script.relative_to(root_path)
|
|
|
+
|
|
|
+ cmd = f"./venv/bin/python3 {script.as_posix()} --country={country_info['code']} " \
|
|
|
+ f"--date={date_str}"
|
|
|
+ try:
|
|
|
+ datalad.api.run(
|
|
|
+ cmd=cmd,
|
|
|
+ dataset=root_path,
|
|
|
+ message=f"Read DI data for {country_info['name']}.",
|
|
|
+ inputs=country_info["input"],
|
|
|
+ outputs=country_info["output"],
|
|
|
+ dry_run=None,
|
|
|
+ explicit=False,
|
|
|
+ )
|
|
|
+ except IncompleteResultsError as IRE:
|
|
|
+ print(f"IncompleteResultsError occurred when running {cmd}: {IRE}")
|
|
|
+ except Exception as ex:
|
|
|
+ print(f"Exception occurred when running {cmd}")
|
|
|
+        print(ex)
|
|
|
+
|
|
|
+
|
|
|
+def read_DI_for_country_group_datalad(
|
|
|
+ annexI: bool=False,
|
|
|
+) -> None:
|
|
|
+ """
|
|
|
+ Wrapper around read_UNFCCC_DI_for_country_group which takes care of selecting input
|
|
|
+ and output files and using datalad run to trigger the data processing
|
|
|
+
|
|
|
+ Parameters
|
|
|
+    ----------
|
|
|
+
|
|
|
+    annexI: bool, default False
+        If True, read data for AnnexI countries, otherwise for non-AnnexI countries.
|
|
|
+ """
|
|
|
+
|
|
|
+ if annexI:
|
|
|
+ country_group = "AnnexI"
|
|
|
+ else:
|
|
|
+ country_group = "non-AnnexI"
|
|
|
+
|
|
|
+ print(f"Attempting to read DI data for {country_group}.")
|
|
|
+ print("#"*80)
|
|
|
+ print("")
|
|
|
+ print(f"Using the UNFCCC_DI_reader")
|
|
|
+ print("")
|
|
|
+ print(f"Run the script using datalad run via the python api")
|
|
|
+ script = code_path / "UNFCCC_DI_reader" / "read_UNFCCC_DI_for_country_group.py"
|
|
|
+ script = script.relative_to(root_path)
|
|
|
+
|
|
|
+ cmd = f"./venv/bin/python3 {script.as_posix()} "
|
|
|
+ if annexI:
|
|
|
+ cmd = cmd + f" --annexI"
|
|
|
+
|
|
|
+ try:
|
|
|
+ datalad.api.run(
|
|
|
+ cmd=cmd,
|
|
|
+ dataset=root_path,
|
|
|
+ message=f"Read DI data for {country_group}.",
|
|
|
+ inputs=[],
|
|
|
+ outputs=[],
|
|
|
+ dry_run=None,
|
|
|
+ explicit=False,
|
|
|
+ )
|
|
|
+ except IncompleteResultsError as IRE:
|
|
|
+ print(f"IncompleteResultsError occurred when running {cmd}: {IRE}")
|
|
|
+ except Exception as ex:
|
|
|
+ print(f"Exception occurred when running {cmd}")
|
|
|
+        print(ex)
|
|
|
+
|
|
|
+
|
|
|
+## helper functions
|
|
|
+def determine_filename(
|
|
|
+ country_code: str,
|
|
|
+ date_or_hash: str,
|
|
|
+ raw: bool=False,
|
|
|
+ hash: bool=False,
|
|
|
+) -> Path:
|
|
|
+ """
|
|
|
+ Determine the filename for a dataset from given country code and date string.
|
|
|
+
|
|
|
+
|
|
|
+ Parameters
|
|
|
+ ----------
|
|
|
+ country_code: str
|
|
|
+ ISO 3 letter code of the country
|
|
|
+ date_or_hash:
|
|
|
+ formatted date string
|
|
|
+ raw: bool
|
|
|
+        bool specifying if the filename for raw or processed data should be returned
+    hash: bool
+        if True, the hash-based filename (with '_hash' suffix) is returned instead
+        of the date-based one
|
|
|
+
|
|
|
+ Returns
|
|
|
+    -------
|
|
|
+ pathlib Path object for the file name (without suffix)
|
|
|
+
|
|
|
+ """
|
|
|
+
|
|
|
+ # get the country folder
|
|
|
+ with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
|
|
|
+ folder_mapping = json.load(mapping_file)
|
|
|
+
|
|
|
+ if country_code in folder_mapping:
|
|
|
+ file_filter = {}
|
|
|
+ file_filter["party"] = country_code
|
|
|
+ country_folders = folder_mapping[country_code]
|
|
|
+ if isinstance(country_folders, str):
|
|
|
+ # only one folder
|
|
|
+ country_folder = extracted_data_path_UNFCCC / country_folders
|
|
|
+ else:
|
|
|
+ raise ValueError("More than one output folder for country "
|
|
|
+ f"{country_code}. This should not happen.")
|
|
|
+ else:
|
|
|
+ # folder not in mapping. It will be created if not present yet
|
|
|
+ country_name = get_country_name(country_code)
|
|
|
+ country_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
|
|
|
+
|
|
|
+ if country_folder.exists():
|
|
|
+ print(f"Output folder {country_name.replace(' ', '_')} for country "
|
|
|
+ f"{country_code} exists but is not in folder mapping. Update "
|
|
|
+ "folder mapping")
|
|
|
+ else:
|
|
|
+ country_folder.mkdir()
|
|
|
+
|
|
|
+ filename = f"{country_code}_DI_{date_or_hash}"
|
|
|
+ if raw:
|
|
|
+ filename = f"{filename}_raw"
|
|
|
+ if hash:
|
|
|
+ filename = f"{filename}_hash"
|
|
|
+ filename = country_folder / filename
|
|
|
+
|
|
|
+ return filename.relative_to(root_path)
|
|
|
+
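+# Example of the returned path (relative to root_path; the country folder name
+# comes from the folder mapping and is only illustrative here):
+#
+#     determine_filename("KEN", "2023-01-30", raw=True)
+#     # -> <country folder for KEN>/KEN_DI_2023-01-30_raw  (no suffix)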
|
|
|
+
|
|
|
+def determine_dataset_filename(
|
|
|
+ date_or_hash: str,
|
|
|
+ raw: bool=False,
|
|
|
+ annexI: bool=False,
|
|
|
+ hash: bool = False,
|
|
|
+) -> Path:
|
|
|
+ """
|
|
|
+ Determine the filename for a dataset from given country group and date string.
|
|
|
+
|
|
|
+ Parameters
|
|
|
+ ----------
|
|
|
+ date_or_hash:
|
|
|
+ formatted date string
|
|
|
+ raw: bool
|
|
|
+        bool specifying if the filename for raw or processed data should be returned
+    annexI: bool, default False
+        True if AnnexI data, False if non-AnnexI data
+    hash: bool
+        if True, the hash-based filename (with '_hash' suffix) is returned instead
+        of the date-based one
|
|
|
+
|
|
|
+ Returns
|
|
|
+    -------
|
|
|
+ pathlib Path object for the file name (without suffix)
|
|
|
+ """
|
|
|
+
|
|
|
+ # get the country folder
|
|
|
+ if annexI:
|
|
|
+ current_dataset_path = dataset_path_UNFCCC / "DI_AnnexI"
|
|
|
+ filename = f"DI_AnnexI_{date_or_hash}"
|
|
|
+ else:
|
|
|
+ current_dataset_path = dataset_path_UNFCCC / "DI_non_AnnexI"
|
|
|
+ filename = f"DI_non_AnnexI_{date_or_hash}"
|
|
|
+
|
|
|
+ if not current_dataset_path.exists():
|
|
|
+ current_dataset_path.mkdir()
|
|
|
+
|
|
|
+ if raw:
|
|
|
+ filename = f"{filename}_raw"
|
|
|
+ if hash:
|
|
|
+ filename = f"{filename}_hash"
|
|
|
+ filename = current_dataset_path / filename
|
|
|
+
|
|
|
+ return filename.relative_to(root_path)
|
|
|
+
|
|
|
+
|
|
|
+def get_input_and_output_files_for_country_DI(
|
|
|
+ country: str,
|
|
|
+ date_str: str,
|
|
|
+ raw: bool,
|
|
|
+ verbose: Optional[bool]=True,
|
|
|
+) -> Dict[str, Union[List, str]]:
|
|
|
+ """
|
|
|
+ Get input and output files for a given country
|
|
|
+ """
|
|
|
+
|
|
|
+ country_info = {}
|
|
|
+
|
|
|
+ if country in custom_country_mapping:
|
|
|
+ country_code = country
|
|
|
+ else:
|
|
|
+ country_code = get_country_code(country)
|
|
|
+ # now get the country name
|
|
|
+ country_name = get_country_name(country_code)
|
|
|
+ country_info["code"] = country_code
|
|
|
+ country_info["name"] = country_name
|
|
|
|
|
|
+
|
|
|
+ # determine latest data
|
|
|
+ print(f"Determining output files for {country_name}")
|
|
|
+
|
|
|
+ # get input files (only for processing)
|
|
|
+ if raw:
|
|
|
+ input_files = []
|
|
|
+ else:
|
|
|
+ # get latest dataset if no date given
|
|
|
+ if date_str is None:
|
|
|
+ # get the latest date
|
|
|
+ input_file = [find_latest_DI_data(country_code, raw=True)]
|
|
|
+ else:
|
|
|
+ input_file = [determine_filename(country_code, date_str, raw=False,
|
|
|
+ hash=False)]
|
|
|
+ if input_file[0].is_symlink():
|
|
|
+ # also get the file with the actual data
|
|
|
+ input_file.append(input_file[0].readlink())
|
|
|
+ else:
|
|
|
+                # DI processing input files with date labels should always be symlinks
+                # to the files with hashes holding the actual data.
+                raise ValueError(f"Input file {input_file[0].name} is not a symlink "
+                                 f"or does not exist. Check if the data you want to "
+                                 f"process exists and if your repository is up to date.")
|
|
|
+
|
|
|
+ input_files = [f"{input_file.as_posix()}.{suffix}" for
|
|
|
+ suffix in ['yaml', 'csv', 'nc']]
|
|
|
+
|
|
|
+ if verbose:
|
|
|
+ print(f"The following files are considered as input_files:")
|
|
|
+ for file in input_files:
|
|
|
+ print(file)
|
|
|
+ print("")
|
|
|
+
|
|
|
+ # get output files
|
|
|
+ output_file = determine_filename(country_code, date_str, raw=raw)
|
|
|
+ output_files = [f"{output_file.as_posix()}.{suffix}" for
|
|
|
+ suffix in ['yaml', 'csv', 'nc']]
|
|
|
+
|
|
|
+ if verbose:
|
|
|
+ print(f"The following files are considered as output_files:")
|
|
|
+ for file in output_files:
|
|
|
+ print(file)
|
|
|
+ print("")
|
|
|
+
|
|
|
+ # add to country info
|
|
|
+ country_info["input"] = input_files
|
|
|
+ country_info["output"] = [] #output_files # not used because we don't know the
|
|
|
+ # hash in advance
|
|
|
+
|
|
|
+ return country_info
|
|
|
+
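+# Shape of the returned dict (sketch):
+#
+#     {
+#         "code": "KEN",
+#         "name": "Kenya",
+#         "input": [],   # empty for raw reading; yaml/csv/nc paths for processing
+#         "output": [],  # left empty because the output hash is not known in advance
+#     }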
|
|
|
+
|
|
|
+def get_present_hashes_for_country_DI(
|
|
|
+ country_code: str,
|
|
|
+ raw: bool,
|
|
|
+) -> List:
|
|
|
+ '''
|
|
|
+ Get the hashes of outputs
|
|
|
+ '''
|
|
|
+
|
|
|
+ regex_hash = r"_([a-f0-9]*)_"
|
|
|
+ if raw:
|
|
|
+ regex_hash = regex_hash + "raw_hash\.nc"
|
|
|
+ else:
|
|
|
+ regex_hash = regex_hash + "hash\.nc"
|
|
|
+
|
|
|
+ # get the country folder
|
|
|
+ with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
|
|
|
+ folder_mapping = json.load(mapping_file)
|
|
|
+
|
|
|
+ if country_code in folder_mapping:
|
|
|
+ file_filter = {}
|
|
|
+ file_filter["party"] = country_code
|
|
|
+ country_folders = folder_mapping[country_code]
|
|
|
+ if isinstance(country_folders, str):
|
|
|
+ # only one folder
|
|
|
+ country_folder = extracted_data_path_UNFCCC / country_folders
|
|
|
+ else:
|
|
|
+ raise ValueError("More than one output folder for country "
|
|
|
+ f"{country_code}. This should not happen.")
|
|
|
+
|
|
|
+ files_list = list(country_folder.glob("*_hash.nc"))
|
|
|
+ # filter according to raw flag
|
|
|
+ if raw:
|
|
|
+ files_list = [file.name for file in files_list if
|
|
|
+ re.search(r'_raw_hash', file.name)]
|
|
|
+ else:
|
|
|
+ files_list = [file.name for file in files_list if
|
|
|
+ not re.search(r'_raw_hash', file.name)]
|
|
|
+
|
|
|
+ hash_list = [re.findall(regex_hash, file)[0] for file in files_list]
|
|
|
+ return hash_list
|
|
|
+
|
|
|
+ else:
|
|
|
+ # folder not in mapping.
|
|
|
+ return []
|
|
|
+
|
|
|
+
|
|
|
+def find_latest_DI_data(
|
|
|
+ country_code: str,
|
|
|
+ raw: bool=True,
|
|
|
+)->Union[Path, None]:
|
|
|
+ '''
|
|
|
+ Find the path to the nc file with the latest DI data for a given country
|
|
|
+ '''
|
|
|
+
|
|
|
+ if raw:
|
|
|
+ regex = regex_date + r"_raw\.nc"
|
|
|
+ else:
|
|
|
+ regex = regex_date + r"\.nc"
|
|
|
+
|
|
|
+ # get the country folder
|
|
|
+ with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
|
|
|
+ folder_mapping = json.load(mapping_file)
|
|
|
+
|
|
|
+ if country_code in folder_mapping:
|
|
|
+ file_filter = {}
|
|
|
+ file_filter["party"] = country_code
|
|
|
+ country_folders = folder_mapping[country_code]
|
|
|
+ if isinstance(country_folders, str):
|
|
|
+ # only one folder
|
|
|
+ country_folder = extracted_data_path_UNFCCC / country_folders
|
|
|
+ else:
|
|
|
+ raise ValueError("More than one output folder for country "
|
|
|
+ f"{country_code}. This should not happen.")
|
|
|
+
|
|
|
+ files_path_list = list(country_folder.glob("*.nc"))
|
|
|
+ # remove files with hash
|
|
|
+ files_list = [file.name for file in files_path_list
|
|
|
+ if not re.search(r'_hash\.nc', file.name)]
|
|
|
+ # filter according to raw flag
|
|
|
+ if raw:
|
|
|
+ files_list = [file for file in files_list if
|
|
|
+ re.search(r'_raw\.nc', file)]
|
|
|
+ else:
|
|
|
+ files_list = [file for file in files_list if
|
|
|
+ not re.search(r'_raw\.nc', file)]
|
|
|
+
|
|
|
+ if len(files_list) > 0:
|
|
|
+ date_list = [re.findall(regex, file)[0] for file in files_list]
|
|
|
+ latest_date = find_latest_date(date_list, '%Y-%m-%d')
|
|
|
+ latest_file = [file for file in files_path_list if re.search(latest_date,
|
|
|
+ file.name)][0]
|
|
|
+ return latest_file
|
|
|
+ else:
|
|
|
+ return None
|
|
|
+
|
|
|
+ else:
|
|
|
+ # folder not in mapping.
|
|
|
+ return None
|
|
|
+
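+# Illustrative call (comment only): get the path of the latest raw DI file for a
+# country, e.g. a file named like KEN_DI_2023-01-30_raw.nc, or None if nothing has
+# been read yet.
+#
+#     latest_raw = find_latest_DI_data("KEN", raw=True)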
|
|
|
+# TODO
|
|
|
+
|
|
|
+# functions
|
|
|
+
|
|
|
+# def compare_with_existing
|
|
|
+# def
|