Parcourir la source

restructure repository; add function to read all DI data

Johannes Gütschow il y a 1 an
Parent
commit
532334647f
34 fichiers modifiés avec 839 ajouts et 747 suppressions
  1. 2 12
      UNFCCC_GHG_data/UNFCCC_CRF_reader/CRF_raw_for_year.py
  2. 6 24
      UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_core.py
  3. 1 3
      UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_devel.py
  4. 7 11
      UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py
  5. 0 20
      UNFCCC_GHG_data/UNFCCC_CRF_reader/util.py
  6. 202 21
      UNFCCC_GHG_data/UNFCCC_DI_reader/UNFCCC_DI_reader_core.py
  7. 4 1
      UNFCCC_GHG_data/UNFCCC_DI_reader/__init__.py
  8. 0 75
      UNFCCC_GHG_data/UNFCCC_DI_reader/util.py
  9. 12 11
      UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py
  10. 7 9
      UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py
  11. 8 9
      UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py
  12. 2 3
      UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py
  13. 2 3
      UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py
  14. 3 4
      UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py
  15. 1 6
      UNFCCC_GHG_data/UNFCCC_reader/Argentina/read_ARG_BUR4_from_pdf.py
  16. 1 6
      UNFCCC_GHG_data/UNFCCC_reader/Chile/read_CHL_BUR4_from_xlsx.py
  17. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Colombia/read_COL_BUR3_from_xlsx.py
  18. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Indonesia/read_IDN_BUR3_from_pdf.py
  19. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Mexico/read_MEX_BUR3_from_pdf.py
  20. 2 8
      UNFCCC_GHG_data/UNFCCC_reader/Morocco/read_MAR_BUR3_from_pdf.py
  21. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_2021-Inventory_from_xlsx.py
  22. 1 6
      UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py
  23. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Taiwan/read_TWN_2022-Inventory_from_pdf.py
  24. 1 8
      UNFCCC_GHG_data/UNFCCC_reader/Thailand/read_THA_BUR3_from_pdf.py
  25. 1 6
      UNFCCC_GHG_data/UNFCCC_reader/__init__.py
  26. 3 437
      UNFCCC_GHG_data/UNFCCC_reader/get_submissions_info.py
  27. 3 9
      UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py
  28. 6 1
      UNFCCC_GHG_data/__init__.py
  29. 24 0
      UNFCCC_GHG_data/helper/__init__.py
  30. 2 2
      UNFCCC_GHG_data/helper/country_info.py
  31. 49 0
      UNFCCC_GHG_data/helper/definitions.py
  32. 1 1
      UNFCCC_GHG_data/helper/folder_mapping.py
  33. 445 0
      UNFCCC_GHG_data/helper/functions.py
  34. 38 11
      dodo.py

+ 2 - 12
UNFCCC_GHG_data/UNFCCC_CRF_reader/CRF_raw_for_year.py

@@ -9,20 +9,10 @@ submission are available in the downloaded data folder.
 # TODO: integrate into doit
 # TODO: integrate into doit
 
 
 import argparse
 import argparse
-import sys
 import primap2 as pm2
 import primap2 as pm2
 from pathlib import Path
 from pathlib import Path
 from datetime import date
 from datetime import date
-
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-#log_path = root_path / "log"
-code_path = root_path / "UNFCCC_GHG_data"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / "extracted_data" / "UNFCCC"
-dataset_path = root_path / "datasets" / "UNFCCC"
-
-#sys.path.append(code_path.name)
+from UNFCCC_GHG_data.helper import dataset_path_UNFCCC
 
 
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.util import all_crf_countries
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.util import all_crf_countries
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.UNFCCC_CRF_reader_prod import get_input_and_output_files_for_country
 from UNFCCC_GHG_data.UNFCCC_CRF_reader.UNFCCC_CRF_reader_prod import get_input_and_output_files_for_country
@@ -81,7 +71,7 @@ for country in all_crf_countries:
 today = date.today()
 today = date.today()
 
 
 compression = dict(zlib=True, complevel=9)
 compression = dict(zlib=True, complevel=9)
-output_folder = dataset_path / f"CRF{submission_year}"
+output_folder = dataset_path_UNFCCC / f"CRF{submission_year}"
 output_filename = f"CRF{submission_year}_raw_{today.strftime('%Y-%m-%d')}"
 output_filename = f"CRF{submission_year}_raw_{today.strftime('%Y-%m-%d')}"
 
 
 if not output_folder.exists():
 if not output_folder.exists():

+ 6 - 24
UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_core.py

@@ -8,9 +8,7 @@ import re
 import json
 import json
 import numpy as np
 import numpy as np
 import pandas as pd
 import pandas as pd
-import xarray as xr
 import primap2 as pm2
 import primap2 as pm2
-import pycountry
 from pathlib import Path
 from pathlib import Path
 from treelib import Tree
 from treelib import Tree
 from operator import itemgetter
 from operator import itemgetter
@@ -18,8 +16,8 @@ from collections import Counter
 from typing import Dict, List, Optional, Tuple, Union
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime, timedelta
 from datetime import datetime, timedelta
 from . import crf_specifications as crf
 from . import crf_specifications as crf
-from .util import downloaded_data_path, NoCRFFilesError, custom_country_mapping
-
+from .util import NoCRFFilesError
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 
 
 ### reading functions
 ### reading functions
 def convert_crf_table_to_pm2if(
 def convert_crf_table_to_pm2if(
@@ -568,7 +566,7 @@ def get_crf_files(
     # we should only have files for one country and submission in the folder. But the
     # we should only have files for one country and submission in the folder. But the
     # function can also be used on a given folder and then the filter is useful.
     # function can also be used on a given folder and then the filter is useful.
     if folder is None:
     if folder is None:
-        data_folder = downloaded_data_path
+        data_folder = downloaded_data_path_UNFCCC
         submission_folder = f"CRF{submission_year}"
         submission_folder = f"CRF{submission_year}"
 
 
         with open(data_folder / "folder_mapping.json", "r") as mapping_file:
         with open(data_folder / "folder_mapping.json", "r") as mapping_file:
@@ -935,7 +933,7 @@ def get_latest_date_for_country(
         str: string with date
         str: string with date
     """
     """
 
 
-    with open(downloaded_data_path / "folder_mapping.json", "r") as mapping_file:
+    with open(downloaded_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
         folder_mapping = json.load(mapping_file)
         folder_mapping = json.load(mapping_file)
 
 
     if country_code in folder_mapping:
     if country_code in folder_mapping:
@@ -946,12 +944,12 @@ def get_latest_date_for_country(
         if isinstance(country_folders, str):
         if isinstance(country_folders, str):
             # only one folder
             # only one folder
             submission_date = find_latest_date(get_submission_dates(
             submission_date = find_latest_date(get_submission_dates(
-                downloaded_data_path / country_folders / f"CRF{submission_year}", file_filter))
+                downloaded_data_path_UNFCCC / country_folders / f"CRF{submission_year}", file_filter))
         else:
         else:
             dates = []
             dates = []
             for folder in country_folders:
             for folder in country_folders:
                 dates = dates + get_submission_dates(
                 dates = dates + get_submission_dates(
-                    downloaded_data_path / folder / f"CRF{submission_year}", file_filter)
+                    downloaded_data_path_UNFCCC / folder / f"CRF{submission_year}", file_filter)
             submission_date = find_latest_date(dates)
             submission_date = find_latest_date(dates)
     else:
     else:
         raise ValueError(f"No data folder found for country {country_code}. "
         raise ValueError(f"No data folder found for country {country_code}. "
@@ -1059,19 +1057,3 @@ def find_latest_date(
 
 
     return dates_datetime[-1][0]
     return dates_datetime[-1][0]
 
 
-
-def get_country_name(
-        country_code: str,
-) -> str:
-    """get country name from UNFCCC_GHG_data """
-    if country_code in custom_country_mapping:
-        country_name = custom_country_mapping[country_code]
-    else:
-        try:
-            country = pycountry.countries.get(alpha_3=country_code)
-            country_name = country.name
-        except:
-            raise ValueError(f"Country UNFCCC_GHG_data {country_code} can not be mapped to "
-                             f"any country")
-
-    return country_name

+ 1 - 3
UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_devel.py

@@ -13,11 +13,9 @@ from typing import List, Optional
 from pathlib import Path
 from pathlib import Path
 from datetime import date
 from datetime import date
 
 
-
 from .util import all_crf_countries
 from .util import all_crf_countries
-from .util import log_path
+from UNFCCC_GHG_data.helper import log_path, get_country_name
 from . import crf_specifications as crf
 from . import crf_specifications as crf
-from .UNFCCC_CRF_reader_core import get_country_name
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country, read_crf_table
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country, read_crf_table
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 
 

+ 7 - 11
UNFCCC_GHG_data/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py

@@ -20,18 +20,16 @@ from .UNFCCC_CRF_reader_core import read_crf_table
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country
 from .UNFCCC_CRF_reader_core import get_latest_date_for_country
 from .UNFCCC_CRF_reader_core import get_crf_files
 from .UNFCCC_CRF_reader_core import get_crf_files
-from .UNFCCC_CRF_reader_core import get_country_name
 from .UNFCCC_CRF_reader_devel import save_unknown_categories_info
 from .UNFCCC_CRF_reader_devel import save_unknown_categories_info
 from .UNFCCC_CRF_reader_devel import save_last_row_info
 from .UNFCCC_CRF_reader_devel import save_last_row_info
 
 
-from .util import code_path, log_path, \
-    custom_country_mapping, extracted_data_path, root_path, \
-    all_crf_countries, NoCRFFilesError
+from UNFCCC_GHG_data.helper import code_path, log_path, root_path
+from UNFCCC_GHG_data.helper import custom_country_mapping, extracted_data_path_UNFCCC
+from UNFCCC_GHG_data.helper import get_country_code, get_country_name
+from .util import all_crf_countries, NoCRFFilesError
 
 
 #import sys
 #import sys
 #sys.path.append(code_path.name)
 #sys.path.append(code_path.name)
-from ..UNFCCC_reader import get_country_code
-
 
 
 # functions:
 # functions:
 # * testing functions
 # * testing functions
@@ -42,8 +40,6 @@ from ..UNFCCC_reader import get_country_code
 
 
 # TODO: add function to read several / all countries
 # TODO: add function to read several / all countries
 
 
-
-
 # general approach:
 # general approach:
 # main UNFCCC_GHG_data in a function that reads one table from one file.
 # main UNFCCC_GHG_data in a function that reads one table from one file.
 # return raw pandas DF for use in different functions
 # return raw pandas DF for use in different functions
@@ -188,7 +184,7 @@ def read_crf_for_country(
 
 
         if save_data:
         if save_data:
             compression = dict(zlib=True, complevel=9)
             compression = dict(zlib=True, complevel=9)
-            output_folder = extracted_data_path / country_name.replace(" ", "_")
+            output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
             output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
             output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
 
 
             if not output_folder.exists():
             if not output_folder.exists():
@@ -476,7 +472,7 @@ def get_input_and_output_files_for_country(
     country_info["input"] = input_files
     country_info["input"] = input_files
 
 
     # get output file
     # get output file
-    output_folder = extracted_data_path / country_name.replace(" ", "_")
+    output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
     output_files = [output_folder / f"{country_code}_CRF{submission_year}"
     output_files = [output_folder / f"{country_code}_CRF{submission_year}"
                                     f"_{submission_date}.{suffix}" for suffix
                                     f"_{submission_date}.{suffix}" for suffix
                     in ['yaml', 'csv', 'nc']]
                     in ['yaml', 'csv', 'nc']]
@@ -505,7 +501,7 @@ def submission_has_been_read(
     """
     """
     Check if a CRF submission has already been read
     Check if a CRF submission has already been read
     """
     """
-    output_folder = extracted_data_path / country_name.replace(" ", "_")
+    output_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
     output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
     output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
     if output_folder.exists():
     if output_folder.exists():
         existing_files = output_folder.glob(f"{output_filename}.*")
         existing_files = output_folder.glob(f"{output_filename}.*")

+ 0 - 20
UNFCCC_GHG_data/UNFCCC_CRF_reader/util.py

@@ -1,23 +1,3 @@
-from pathlib import Path
-
-# 4 for use from nbs, fix
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-log_path = root_path / "log"
-code_path = root_path / "UNFCCC_GHG_data"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / "extracted_data" / "UNFCCC"
-
-# TODO: move this to a more general location as we can't import it
-# to get_submissions_info
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom of Great Britain and Northern Ireland",
-}
 
 
 all_crf_countries = [
 all_crf_countries = [
     'AUS', 'AUT', 'BEL', 'BGR', 'BLR',
     'AUS', 'AUT', 'BEL', 'BGR', 'BLR',

+ 202 - 21
UNFCCC_GHG_data/UNFCCC_DI_reader/UNFCCC_DI_reader_core.py

@@ -24,11 +24,14 @@ from .UNFCCC_DI_reader_config import di_query_filters
 from .UNFCCC_DI_reader_config import di_processing_info
 from .UNFCCC_DI_reader_config import di_processing_info
 from .UNFCCC_DI_reader_config import cat_conversion
 from .UNFCCC_DI_reader_config import cat_conversion
 from .UNFCCC_DI_reader_config import gas_baskets
 from .UNFCCC_DI_reader_config import gas_baskets
-from .util import NoDIDataError, get_country_name, get_country_code
-from .util import nAI_countries, AI_countries, custom_country_mapping
-from .util import code_path, root_path, extracted_data_path
+from .util import NoDIDataError, nAI_countries, AI_countries
 from .util import DI_date_format, regex_date
 from .util import DI_date_format, regex_date
 
 
+from UNFCCC_GHG_data.helper import custom_country_mapping
+from UNFCCC_GHG_data.helper import get_country_code, get_country_name
+from UNFCCC_GHG_data.helper import extracted_data_path_UNFCCC, root_path, code_path
+from UNFCCC_GHG_data.helper import dataset_path_UNFCCC
+
 
 
 def read_UNFCCC_DI_for_country(
 def read_UNFCCC_DI_for_country(
         country_code: str,
         country_code: str,
@@ -612,7 +615,8 @@ def convert_DI_data_to_pm2_if(
     if date_str == "country":
     if date_str == "country":
         pm2if_specifications["coords_defaults"]["scenario"] = f"DIrolling"
         pm2if_specifications["coords_defaults"]["scenario"] = f"DIrolling"
     elif date_str is None:
     elif date_str is None:
-        date_str = str(date.today())
+        today = date.today()
+        date_str = today.strftime(DI_date_format)
     pm2if_specifications["coords_defaults"]["scenario"] = f"DI{date_str}"
     pm2if_specifications["coords_defaults"]["scenario"] = f"DI{date_str}"
 
 
     # set metadata
     # set metadata
@@ -737,7 +741,7 @@ def save_DI_country_data(
 
 
     # get the filename with the hash and check if it exists (separate for pm2 format
     # get the filename with the hash and check if it exists (separate for pm2 format
     # and IF to fix broken datasets if necessary)
     # and IF to fix broken datasets if necessary)
-    filename_hash = determine_filename(country_code, token, raw, hash=True)
+    filename_hash = root_path / determine_filename(country_code, token, raw, hash=True)
 
 
     # primap2 native format
     # primap2 native format
     filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
     filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
@@ -761,7 +765,79 @@ def save_DI_country_data(
         print(f"Data unchanged for {country_code}. Create symlinks.")
         print(f"Data unchanged for {country_code}. Create symlinks.")
 
 
     # get the filename with the date
     # get the filename with the date
-    filename_date = determine_filename(country_code, date_str, raw)
+    filename_date = root_path / determine_filename(country_code, date_str, raw)
+
+    # create the symlinks to the actual data (with the hash)
+    suffixes = ['.nc', '.csv', '.yaml']
+    for suffix in suffixes:
+        file_date = filename_date.parent / (filename_date.name + suffix)
+        file_hash = filename_hash.name + suffix
+        if file_date.exists():
+            file_date.unlink()
+        file_date.symlink_to(file_hash)
+
+
+def save_DI_dataset(
+        data_pm2: xr.Dataset,
+        raw: bool=True,
+        non_AnnexI: bool=True,
+):
+    '''
+    save primap2 and IF data to dataset folder
+    can be used for raw and processed data but not to save to country folders
+    '''
+
+    # preparations
+    data_if = data_pm2.pr.to_interchange_format()
+    if non_AnnexI:
+        country_group = "non-AnnexI"
+    else:
+        country_group = "AnnexI"
+
+    ## get timestamp
+    scenario_col = data_pm2.attrs['scen']
+    scenarios = data_if[scenario_col].unique()
+    if len(scenarios) > 1:
+        raise ValueError(f"More than one scenario in input data. This function can only"
+                         f"handle single scenario data. Scenarios: {scenarios}")
+    else:
+        scenario = scenarios[0]
+
+    date_str = scenario[2:]
+
+    # calculate the hash of the data to see if it's identical to present data
+    data_for_token = data_if.drop(columns=[scenario_col])
+    token = tokenize(data_for_token)
+
+    # get the filename with the hash and check if it exists (separate for pm2 format
+    # and IF to fix broken datasets if necessary)
+    filename_hash = determine_dataset_filename(token, raw, non_AnnexI=non_AnnexI,
+                                               hash=True)
+    # primap2 native format
+    filename_hash_nc = filename_hash.parent / (filename_hash.name + '.nc')
+    if not filename_hash_nc.exists():
+        # if parent dir does not exist create it
+        # TODO double, also in determine_dataset_filename. same for country data
+        if not filename_hash.parent.exists():
+            filename_hash.parent.mkdir()
+        # save the data
+        print(f"Data has changed. Save to {filename_hash_nc.name}")
+        compression = dict(zlib=True, complevel=9)
+        encoding = {var: compression for var in data_pm2.data_vars}
+        data_pm2.pr.to_netcdf(filename_hash_nc, encoding=encoding)
+
+    # primap2 IF
+    filename_hash_csv = filename_hash.parent / (filename_hash.name + '.csv')
+    if not filename_hash_csv.exists():
+        # save the data
+        print(f"Data has changed. Save to {filename_hash.name + '.csv/.yaml'}")
+        pm2.pm2io.write_interchange_format(filename_hash, data_if)
+    else:
+        print(f"Data unchanged for {country_group}. Create symlinks.")
+
+    # get the filename with the date
+    filename_date = determine_dataset_filename(date_str, raw=raw,
+                                               non_AnnexI=non_AnnexI, hash=False)
 
 
     # create the symlinks to the actual data (with the hash)
     # create the symlinks to the actual data (with the hash)
     suffixes = ['.nc', '.csv', '.yaml']
     suffixes = ['.nc', '.csv', '.yaml']
@@ -773,6 +849,59 @@ def save_DI_country_data(
         file_date.symlink_to(file_hash)
         file_date.symlink_to(file_hash)
 
 
 
 
+## functions for multiple country reading
+def read_UNFCCC_DI_for_all_countries(
+        non_AnnexI: bool=True,
+) -> xr.Dataset:
+    '''
+    This function reads DI data for all countries in a group (annexI or non-AnnexI)
+    TODO: currently only non-annexI is implemented
+    The function reads all data in one go using datalad run. as the output data file
+    names are unknown beforehand datalad run uses explicit=false
+    TODO: decide if dataset creation goes in here as well. Makes sense, I think. Then
+    the function can return the xarray dataset
+    '''
+
+    today = date.today()
+    date_str = today.strftime(DI_date_format)
+
+    if non_AnnexI:
+        countries = nAI_countries
+    else:
+        raise ValueError("Bulk reading for AnnexI countries not implemented yet")
+
+    # read the data
+    data_all = None
+    for country in countries[0:5]:
+        print(f"reading DI data for country {country}")
+
+        try:
+            data_country = read_UNFCCC_DI_for_country(
+                country_code=country,
+                category_groups=None,  # read all categories
+                read_subsectors=False,  # not applicable as we read all categories
+                date_str=date_str,
+                pm2if_specifications=None,
+                # automatically use the right specs for AI and NAI
+                default_gwp=None,  # automatically uses right default GWP for AI and NAI
+                debug=False)
+
+            if data_all is None:
+                data_all = data_country
+            else:
+                data_all = data_all.pr.merge(data_country)
+        except unfccc_di_api.NoDataError as err:
+            print(f"No data for {country}.")
+            print(err)
+
+    # TODO: write metadata
+
+    # save the data
+    #save_DI_dataset(data_all, raw=True, non_AnnexI=non_AnnexI)
+
+    return data_all
+
+
 ## datalad and pydoit interface functions
 ## datalad and pydoit interface functions
 def read_DI_for_country_datalad(
 def read_DI_for_country_datalad(
         country: str,
         country: str,
@@ -790,7 +919,8 @@ def read_DI_for_country_datalad(
     """
     """
 
 
     # get date to determine output filename
     # get date to determine output filename
-    date_str = str(date.today())
+    today = date.today()
+    date_str = today.strftime(DI_date_format)
 
 
     # get all the info for the country
     # get all the info for the country
     country_info = get_input_and_output_files_for_country_DI(country, date_str,
     country_info = get_input_and_output_files_for_country_DI(country, date_str,
@@ -815,7 +945,7 @@ def read_DI_for_country_datalad(
             inputs=country_info["input"],
             inputs=country_info["input"],
             outputs=country_info["output"],
             outputs=country_info["output"],
             dry_run=None,
             dry_run=None,
-            explicit=True,
+            explicit=False,
         )
         )
     except IncompleteResultsError as IRE:
     except IncompleteResultsError as IRE:
         print(f"IncompleteResultsError occured when running {cmd}: {IRE}")
         print(f"IncompleteResultsError occured when running {cmd}: {IRE}")
@@ -865,7 +995,7 @@ def process_DI_for_country_datalad(
             inputs=country_info["input"],
             inputs=country_info["input"],
             outputs=country_info["output"],
             outputs=country_info["output"],
             dry_run=None,
             dry_run=None,
-            explicit=True,
+            explicit=False,
         )
         )
     except IncompleteResultsError as IRE:
     except IncompleteResultsError as IRE:
         print(f"IncompleteResultsError occurred when running {cmd}: {IRE}")
         print(f"IncompleteResultsError occurred when running {cmd}: {IRE}")
@@ -874,15 +1004,21 @@ def process_DI_for_country_datalad(
         print(ex.message)
         print(ex.message)
 
 
 
 
-## helper functions
-
+def read_DI_for_all_countries_datalad(
+        non_AnnexI: bool=True,
+):
+    '''
+    This function calls datalad run to read all data in one go. as the output data file
+    names are unknown beforehand datalad run uses explicit=false
+    '''
 
 
+## helper functions
 def determine_filename(
 def determine_filename(
         country_code: str,
         country_code: str,
         date_or_hash: str,
         date_or_hash: str,
         raw: bool=False,
         raw: bool=False,
         hash: bool=False,
         hash: bool=False,
-)->Path:
+) -> Path:
     """
     """
     Determine the filename for a dataset from given country code and date string.
     Determine the filename for a dataset from given country code and date string.
 
 
@@ -891,10 +1027,11 @@ def determine_filename(
     ----------
     ----------
     country_code: str
     country_code: str
         ISO 3 letter code of the country
         ISO 3 letter code of the country
-    date_str:
+    date_or_hash:
         formatted date string
         formatted date string
-    raw:
+    raw: bool
         bool specifying if filename fow raw or processed data should be returned
         bool specifying if filename fow raw or processed data should be returned
+    hash: str
 
 
     Returns
     Returns
     _______
     _______
@@ -903,7 +1040,7 @@ def determine_filename(
     """
     """
 
 
     # get the country folder
     # get the country folder
-    with open(extracted_data_path / "folder_mapping.json", "r") as mapping_file:
+    with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
         folder_mapping = json.load(mapping_file)
         folder_mapping = json.load(mapping_file)
 
 
     if country_code in folder_mapping:
     if country_code in folder_mapping:
@@ -912,14 +1049,14 @@ def determine_filename(
         country_folders = folder_mapping[country_code]
         country_folders = folder_mapping[country_code]
         if isinstance(country_folders, str):
         if isinstance(country_folders, str):
             # only one folder
             # only one folder
-            country_folder = extracted_data_path / country_folders
+            country_folder = extracted_data_path_UNFCCC / country_folders
         else:
         else:
             raise ValueError("More than one output folder for country "
             raise ValueError("More than one output folder for country "
                              f"{country_code}. This should not happen.")
                              f"{country_code}. This should not happen.")
     else:
     else:
         # folder not in mapping. It will be created if not present yet
         # folder not in mapping. It will be created if not present yet
         country_name = get_country_name(country_code)
         country_name = get_country_name(country_code)
-        country_folder = extracted_data_path / country_name.replace(" ", "_")
+        country_folder = extracted_data_path_UNFCCC / country_name.replace(" ", "_")
 
 
         if country_folder.exists():
         if country_folder.exists():
            print(f"Output folder {country_name.replace(' ', '_')} for country "
            print(f"Output folder {country_name.replace(' ', '_')} for country "
@@ -938,6 +1075,50 @@ def determine_filename(
     return filename.relative_to(root_path)
     return filename.relative_to(root_path)
 
 
 
 
+def determine_dataset_filename(
+        date_or_hash: str,
+        raw: bool=False,
+        non_AnnexI: bool=True,
+        hash: bool = False,
+) -> Path:
+    """
+    Determine the filename for a dataset from given country group and date string.
+
+    Parameters
+    ----------
+    date_or_hash:
+        formatted date string
+    raw: bool
+        bool specifying if filename for raw or processed data should be returned
+    non_AnnexI: bool
+        True if non-AnnexI False if AnnexI
+    hash: bool
+
+    Returns
+    _______
+        pathlib Path object for the file name (without suffix)
+    """
+
+    # get the country folder
+    if non_AnnexI:
+        current_dataset_path = dataset_path_UNFCCC / "DI_non_AnnexI"
+        filename = f"DI_non_AnnexI_{date_or_hash}"
+    else:
+        current_dataset_path = dataset_path_UNFCCC / "DI_AnnexI"
+        filename = f"DI_AnnexI_{date_or_hash}"
+
+    if not current_dataset_path.exists():
+        current_dataset_path.mkdir()
+
+    if raw:
+        filename = f"{filename}_raw"
+    if hash:
+        filename = f"{filename}_hash"
+    filename = current_dataset_path / filename
+
+    return filename.relative_to(root_path)
+
+
 def convert_categories(
 def convert_categories(
         ds_input: xr.Dataset,
         ds_input: xr.Dataset,
         conversion: Dict[str, Dict[str, str]],
         conversion: Dict[str, Dict[str, str]],
@@ -1090,7 +1271,7 @@ def get_present_hashes_for_country_DI(
         regex_hash = regex_hash + "hash\.nc"
         regex_hash = regex_hash + "hash\.nc"
 
 
     # get the country folder
     # get the country folder
-    with open(extracted_data_path / "folder_mapping.json", "r") as mapping_file:
+    with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
         folder_mapping = json.load(mapping_file)
         folder_mapping = json.load(mapping_file)
 
 
     if country_code in folder_mapping:
     if country_code in folder_mapping:
@@ -1099,7 +1280,7 @@ def get_present_hashes_for_country_DI(
         country_folders = folder_mapping[country_code]
         country_folders = folder_mapping[country_code]
         if isinstance(country_folders, str):
         if isinstance(country_folders, str):
             # only one folder
             # only one folder
-            country_folder = extracted_data_path / country_folders
+            country_folder = extracted_data_path_UNFCCC / country_folders
         else:
         else:
             raise ValueError("More than one output folder for country "
             raise ValueError("More than one output folder for country "
                              f"{country_code}. This should not happen.")
                              f"{country_code}. This should not happen.")
@@ -1135,7 +1316,7 @@ def find_latest_DI_data(
         regex = regex_date + r"\.nc"
         regex = regex_date + r"\.nc"
 
 
     # get the country folder
     # get the country folder
-    with open(extracted_data_path / "folder_mapping.json", "r") as mapping_file:
+    with open(extracted_data_path_UNFCCC / "folder_mapping.json", "r") as mapping_file:
         folder_mapping = json.load(mapping_file)
         folder_mapping = json.load(mapping_file)
 
 
     if country_code in folder_mapping:
     if country_code in folder_mapping:
@@ -1144,7 +1325,7 @@ def find_latest_DI_data(
         country_folders = folder_mapping[country_code]
         country_folders = folder_mapping[country_code]
         if isinstance(country_folders, str):
         if isinstance(country_folders, str):
             # only one folder
             # only one folder
-            country_folder = extracted_data_path / country_folders
+            country_folder = extracted_data_path_UNFCCC / country_folders
         else:
         else:
             raise ValueError("More than one output folder for country "
             raise ValueError("More than one output folder for country "
                              f"{country_code}. This should not happen.")
                              f"{country_code}. This should not happen.")

+ 4 - 1
UNFCCC_GHG_data/UNFCCC_DI_reader/__init__.py

@@ -5,7 +5,9 @@ from .UNFCCC_DI_reader_core import \
     read_UNFCCC_DI_for_country, read_DI_for_country_datalad, \
     read_UNFCCC_DI_for_country, read_DI_for_country_datalad, \
     process_UNFCCC_DI_for_country, process_and_save_UNFCCC_DI_for_country, \
     process_UNFCCC_DI_for_country, process_and_save_UNFCCC_DI_for_country, \
     process_DI_for_country_datalad, \
     process_DI_for_country_datalad, \
-    convert_DI_data_to_pm2_if, convert_DI_IF_data_to_pm2, determine_filename
+    convert_DI_data_to_pm2_if, convert_DI_IF_data_to_pm2, determine_filename, \
+    read_UNFCCC_DI_for_all_countries
+
 
 
 
 
 __all__ = [
 __all__ = [
@@ -17,4 +19,5 @@ __all__ = [
     "convert_DI_data_to_pm2_if",
     "convert_DI_data_to_pm2_if",
     "convert_DI_IF_data_to_pm2",
     "convert_DI_IF_data_to_pm2",
     "determine_filename",
     "determine_filename",
+    "read_UNFCCC_DI_for_all_countries",
 ]
 ]

+ 0 - 75
UNFCCC_GHG_data/UNFCCC_DI_reader/util.py

@@ -1,17 +1,6 @@
-from pathlib import Path
 import unfccc_di_api
 import unfccc_di_api
-# imports for copied functions
-import pycountry
-
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-log_path = root_path / "log"
-code_path = root_path / "UNFCCC_GHG_data"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / "extracted_data" / "UNFCCC"
 
 
 reader = unfccc_di_api.UNFCCCApiReader()
 reader = unfccc_di_api.UNFCCCApiReader()
-
 nAI_countries = list(reader.non_annex_one_reader.parties["code"])
 nAI_countries = list(reader.non_annex_one_reader.parties["code"])
 AI_countries = list(reader.annex_one_reader.parties["code"])
 AI_countries = list(reader.annex_one_reader.parties["code"])
 
 
@@ -22,67 +11,3 @@ class NoDIDataError(Exception):
     pass
     pass
 
 
 
 
-# the following is copied from other sub-packages
-# TODO: move these functions to common location to allow easy importing into all modules
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom of Great Britain and Northern Ireland",
-}
-
-
-def get_country_name(
-        country_code: str,
-) -> str:
-    """get country name from code """
-    if country_code in custom_country_mapping:
-        country_name = custom_country_mapping[country_code]
-    else:
-        try:
-            country = pycountry.countries.get(alpha_3=country_code)
-            country_name = country.name
-        except:
-            raise ValueError(f"Country code {country_code} can not be mapped to "
-                             f"any country")
-
-    return country_name
-
-
-def get_country_code(
-        country_name: str,
-)->str:
-    """
-    obtain country code. If the input is a code it will be returned, if the input
-    is not a three letter code a search will be performed
-
-    Parameters
-    __________
-    country_name: str
-        Country code or name to get the three-letter code for.
-
-    """
-    try:
-        # check if it's a 3 letter code
-        country = pycountry.countries.get(alpha_3=country_name)
-        country_code = country.alpha_3
-    except:
-        try:
-            country = pycountry.countries.search_fuzzy(country_name)
-        except:
-            raise ValueError(f"Country name {country_name} can not be mapped to "
-                             f"any country code")
-        if len(country) > 1:
-            country_code = None
-            for current_country in country:
-                if current_country.name == country_name:
-                    country_code = current_country.alpha_3
-            if country_code is None:
-                raise ValueError(f"Country name {country_name} has {len(country)} "
-                                 f"possible results for country codes.")
-
-        country_code = country[0].alpha_3
-
-    return country_code

+ 12 - 11
UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py

@@ -11,7 +11,7 @@ from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
 from pathlib import Path
 from pathlib import Path
 
 
-root = Path(__file__).parents[2]
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
 
 
 ###############
 ###############
 #
 #
@@ -77,12 +77,10 @@ else:
         "submissions/national-inventory-submissions-{}".format(year)
         "submissions/national-inventory-submissions-{}".format(year)
     )
     )
 
 
-download_path = root / "downloaded_data" / "UNFCCC"
-
 error_file_sizes = [212, 210]
 error_file_sizes = [212, 210]
 
 
 # Read submissions list
 # Read submissions list
-submissions = pd.read_csv(download_path / f"submissions-annexI_{year}.csv")
+submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-annexI_{year}.csv")
 
 
 # filter submissions list or category
 # filter submissions list or category
 items = submissions[submissions.Kind  == category.upper()]
 items = submissions[submissions.Kind  == category.upper()]
@@ -120,7 +118,7 @@ for idx, submission in items.iterrows():
     country = country.replace(' ', '_')
     country = country.replace(' ', '_')
     print(f"Downloading {title} from {url}")
     print(f"Downloading {title} from {url}")
 
 
-    country_folder = download_path / country
+    country_folder = downloaded_data_path_UNFCCC / country
     if not country_folder.exists():
     if not country_folder.exists():
         country_folder.mkdir()
         country_folder.mkdir()
     local_filename = \
     local_filename = \
@@ -167,7 +165,7 @@ for idx, submission in items.iterrows():
             
             
         if local_filename.exists():
         if local_filename.exists():
             new_downloaded.append(submission)
             new_downloaded.append(submission)
-            print(f"Download => {local_filename.relative_to(root)}")
+            print(f"Download => {local_filename.relative_to(root_path)}")
             # unzip data (only for new downloads)
             # unzip data (only for new downloads)
             if local_filename.suffix == ".zip":
             if local_filename.suffix == ".zip":
                 try:
                 try:
@@ -177,18 +175,21 @@ for idx, submission in items.iterrows():
                     zipped_file.close()
                     zipped_file.close()
                 # TODO Better error logging/visibilty
                 # TODO Better error logging/visibilty
                 except zipfile.BadZipFile:
                 except zipfile.BadZipFile:
-                    print(f"Error while trying to extract {local_filename.relative_to(root)}")
+                    print(f"Error while trying to extract "
+                          f"{local_filename.relative_to(root_path)}")
                 except NotImplementedError:
                 except NotImplementedError:
                     print("Zip format not supported, please unzip on the command line.")
                     print("Zip format not supported, please unzip on the command line.")
             else:
             else:
-                print(f"Not attempting to extract {local_filename.relative_to(root)}.")
+                print(f"Not attempting to extract "
+                      f"{local_filename.relative_to(root_path)}.")
         else:
         else:
-            print(f"Failed to download {local_filename.relative_to(root)}")
+            print(f"Failed to download {local_filename.relative_to(root_path)}")
 
 
     else:
     else:
-        print(f"=> Already downloaded {local_filename.relative_to(root)}")
+        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
 
 
 driver.close()
 driver.close()
 
 
 df = pd.DataFrame(new_downloaded)
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC
+          / f"00_new_downloads_{category}{year}-{date.today()}.csv", index=False)

+ 7 - 9
UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py

@@ -5,9 +5,9 @@ import time
 import os
 import os
 from datetime import date
 from datetime import date
 from random import randrange
 from random import randrange
-
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 from pathlib import Path
 from pathlib import Path
-root = Path(__file__).parents[2]
+
 """
 """
 based on download_bur from national-inventory-submissions
 based on download_bur from national-inventory-submissions
 # (https://github.com/openclimatedata/national-inventory-submisions)
 # (https://github.com/openclimatedata/national-inventory-submisions)
@@ -35,13 +35,11 @@ url = "https://www4.unfccc.int/sites/NDCStaging/Pages/All.aspx"
 error_file_sizes = [212, 210]
 error_file_sizes = [212, 210]
 
 
 # Ensure download path and subfolders exist
 # Ensure download path and subfolders exist
-download_path = root / "downloaded_data" / "UNFCCC"
-if not download_path.exists():
-    download_path.mkdir(parents=True)
+if not downloaded_data_path_UNFCCC.exists():
+    downloaded_data_path_UNFCCC.mkdir(parents=True)
 
 
 new_downloaded = []
 new_downloaded = []
 
 
-
 for idx, submission in submissions.iterrows():
 for idx, submission in submissions.iterrows():
     print("=" * 60)
     print("=" * 60)
     ndc = submission.Number
     ndc = submission.Number
@@ -54,12 +52,12 @@ for idx, submission in submissions.iterrows():
 
 
     ndc_folder = "NDC_" + ndc + "_" + submission_date
     ndc_folder = "NDC_" + ndc + "_" + submission_date
 
 
-    country_folder = download_path / country
+    country_folder = downloaded_data_path_UNFCCC / country
     if not country_folder.exists():
     if not country_folder.exists():
         country_folder.mkdir()
         country_folder.mkdir()
     local_filename = country_folder / ndc_folder / url.split('/')[-1]
     local_filename = country_folder / ndc_folder / url.split('/')[-1]
     local_filename_underscore = \
     local_filename_underscore = \
-        download_path / country / ndc_folder / \
+        downloaded_data_path_UNFCCC / country / ndc_folder / \
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
         url.split('/')[-1].replace("%20", "_").replace(" ", "_")
     if not local_filename.parent.exists():
     if not local_filename.parent.exists():
         local_filename.parent.mkdir()
         local_filename.parent.mkdir()
@@ -102,4 +100,4 @@ for idx, submission in submissions.iterrows():
 
 
 
 
 df = pd.DataFrame(new_downloaded)
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)
+df.to_csv(downloaded_data_path_UNFCCC / "00_new_downloads_ndc-{}.csv".format(date.today()), index=False)

+ 8 - 9
UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py

@@ -9,8 +9,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
 from pathlib import Path
 from pathlib import Path
-
-root = Path(__file__).parents[2]
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path_UNFCCC
 
 
 ###############
 ###############
 #
 #
@@ -45,8 +44,7 @@ else:
 error_file_sizes = [212, 210]
 error_file_sizes = [212, 210]
 
 
 # Read submissions list
 # Read submissions list
-download_path = root / "downloaded_data" / "UNFCCC"
-submissions = pd.read_csv(download_path / f"submissions-{category.lower()}.csv")
+submissions = pd.read_csv(downloaded_data_path_UNFCCC / f"submissions-{category.lower()}.csv")
 
 
 # set options for headless mode
 # set options for headless mode
 profile_path = ".firefox"
 profile_path = ".firefox"
@@ -82,7 +80,7 @@ for idx, submission in submissions.iterrows():
     country = country.replace(' ', '_')
     country = country.replace(' ', '_')
     print(f"Downloading {title} from {url}")
     print(f"Downloading {title} from {url}")
 
 
-    country_folder = download_path / country
+    country_folder = downloaded_data_path_UNFCCC / country
     if not country_folder.exists():
     if not country_folder.exists():
         country_folder.mkdir()
         country_folder.mkdir()
     local_filename = \
     local_filename = \
@@ -129,14 +127,15 @@ for idx, submission in submissions.iterrows():
             
             
         if local_filename.exists():
         if local_filename.exists():
             new_downloaded.append(submission)
             new_downloaded.append(submission)
-            print(f"Download => {local_filename.relative_to(root)}")
+            print(f"Download => {local_filename.relative_to(root_path)}")
         else:
         else:
-            print(f"Failed to download {local_filename.relative_to(root)}")
+            print(f"Failed to download {local_filename.relative_to(root_path)}")
 
 
     else:
     else:
-        print(f"=> Already downloaded {local_filename.relative_to(root)}")
+        print(f"=> Already downloaded {local_filename.relative_to(root_path)}")
 
 
 driver.close()
 driver.close()
 
 
 df = pd.DataFrame(new_downloaded)
 df = pd.DataFrame(new_downloaded)
-df.to_csv(download_path / f"00_new_downloads_{category}-{date.today()}.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC /
+          f"00_new_downloads_{category}-{date.today()}.csv", index=False)

+ 2 - 3
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py

@@ -8,8 +8,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
 from unfccc_submission_info import get_unfccc_submission_info
 from unfccc_submission_info import get_unfccc_submission_info
-
-root = Path(__file__).absolute().parents[2]
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 
 
 max_tries = 10
 max_tries = 10
 
 
@@ -143,4 +142,4 @@ if len(no_downloads) > 0:
 
 
 driver.close()
 driver.close()
 df = pd.DataFrame(downloads)
 df = pd.DataFrame(downloads)
-df.to_csv(root / "downloaded_data" / "UNFCCC" / f"submissions-annexI_{year}.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC / f"submissions-annexI_{year}.csv", index=False)

+ 2 - 3
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py

@@ -9,8 +9,7 @@ from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
 from unfccc_submission_info import get_unfccc_submission_info
 from unfccc_submission_info import get_unfccc_submission_info
-
-root = Path(__file__).absolute().parents[2]
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 
 
 """
 """
 Download UNFCCC Biennial Update Report submissions
 Download UNFCCC Biennial Update Report submissions
@@ -84,4 +83,4 @@ if len(no_downloads) > 0:
 driver.close()
 driver.close()
 df = pd.DataFrame(downloads)
 df = pd.DataFrame(downloads)
 df = df[["Kind", "Country", "Title", "URL"]]
 df = df[["Kind", "Country", "Title", "URL"]]
-df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-bur.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC / "submissions-bur.csv", index=False)

+ 3 - 4
UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py

@@ -8,9 +8,8 @@ from bs4 import BeautifulSoup
 from selenium.webdriver import Firefox
 from selenium.webdriver import Firefox
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.options import Options
 from random import randrange
 from random import randrange
-from unfccc_submission_info import get_unfccc_submission_info
-
-root = Path(__file__).absolute().parents[2]
+from .unfccc_submission_info import get_unfccc_submission_info
+from UNFCCC_GHG_data.helper import downloaded_data_path_UNFCCC
 
 
 """
 """
 Download UNFCCC Biennial Update Report submissions
 Download UNFCCC Biennial Update Report submissions
@@ -85,4 +84,4 @@ if len(no_downloads) > 0:
 driver.close()
 driver.close()
 df = pd.DataFrame(downloads)
 df = pd.DataFrame(downloads)
 df = df[["Kind", "Country", "Title", "URL"]]
 df = df[["Kind", "Country", "Title", "URL"]]
-df.to_csv(root / "downloaded_data" / "UNFCCC" / "submissions-nc.csv", index=False)
+df.to_csv(downloaded_data_path_UNFCCC / "submissions-nc.csv", index=False)

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/Argentina/read_ARG_BUR4_from_pdf.py

@@ -6,7 +6,7 @@ import sys
 import camelot
 import camelot
 import primap2 as pm2
 import primap2 as pm2
 from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
 from primap2.pm2io._conversion import convert_ipcc_code_primap_to_primap2
-from pathlib import Path
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
@@ -16,11 +16,6 @@ from pathlib import Path
 #  PRIMAP2 version
 #  PRIMAP2 version
 
 
 # folders and files
 # folders and files
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Argentina' / \
 input_folder = downloaded_data_path / 'UNFCCC' / 'Argentina' / \
                'BUR4'
                'BUR4'
 output_folder = extracted_data_path / 'UNFCCC' / 'Argentina'
 output_folder = extracted_data_path / 'UNFCCC' / 'Argentina'

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/Chile/read_CHL_BUR4_from_xlsx.py

@@ -5,9 +5,9 @@ import os
 import sys
 import sys
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 
 
 from config_CHL_BUR4 import cat_mapping, filter_remove_IPCC2006, aggregate_cats
 from config_CHL_BUR4 import cat_mapping, filter_remove_IPCC2006, aggregate_cats
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import filter_data
 from primap2.pm2io._data_reading import filter_data
 
 
@@ -16,11 +16,6 @@ from primap2.pm2io._data_reading import filter_data
 # ###
 # ###
 
 
 # folders and files
 # folders and files
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Chile' / 'BUR4'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Chile' / 'BUR4'
 output_folder = extracted_data_path / 'UNFCCC' / 'Chile'
 output_folder = extracted_data_path / 'UNFCCC' / 'Chile'
 if not output_folder.exists():
 if not output_folder.exists():

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Colombia/read_COL_BUR3_from_xlsx.py

@@ -4,19 +4,12 @@
 
 
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Colombia' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Colombia' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Colombia'
 output_folder = extracted_data_path / 'UNFCCC' / 'Colombia'
 if not output_folder.exists():
 if not output_folder.exists():

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Indonesia/read_IDN_BUR3_from_pdf.py

@@ -4,21 +4,14 @@
 
 
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import camelot
 import numpy as np
 import numpy as np
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Indonesia' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Indonesia' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Indonesia'
 output_folder = extracted_data_path / 'UNFCCC' / 'Indonesia'
 if not output_folder.exists():
 if not output_folder.exists():

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Mexico/read_MEX_BUR3_from_pdf.py

@@ -3,20 +3,13 @@
 
 
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import camelot
 from config_MEX_BUR3 import page_defs, fix_rows
 from config_MEX_BUR3 import page_defs, fix_rows
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Mexico' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Mexico' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Mexico'
 output_folder = extracted_data_path / 'UNFCCC' / 'Mexico'
 if not output_folder.exists():
 if not output_folder.exists():

+ 2 - 8
UNFCCC_GHG_data/UNFCCC_reader/Morocco/read_MAR_BUR3_from_pdf.py

@@ -5,25 +5,19 @@ import camelot
 import primap2 as pm2
 import primap2 as pm2
 import pandas as pd
 import pandas as pd
 import copy
 import copy
-from pathlib import Path
+
 from config_MAR_BUR3 import zero_cats, cat_mapping, aggregate_cats, remove_cats, \
 from config_MAR_BUR3 import zero_cats, cat_mapping, aggregate_cats, remove_cats, \
     table_defs, header_defs
     table_defs, header_defs
 from primap2.pm2io._data_reading import matches_time_format, filter_data
 from primap2.pm2io._data_reading import matches_time_format, filter_data
+from UNFCCC_GHG_data.helper import extracted_data_path, downloaded_data_path
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Morocco' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Morocco' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Morocco'
 output_folder = extracted_data_path / 'UNFCCC' / 'Morocco'
 output_filename = 'MAR_BUR3_2022_'
 output_filename = 'MAR_BUR3_2022_'
-
 inventory_file = 'Morocco_BUR3_Fr.pdf'
 inventory_file = 'Morocco_BUR3_Fr.pdf'
-
 gwp_to_use = 'AR4GWP100'
 gwp_to_use = 'AR4GWP100'
 
 
 # years to read
 # years to read

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_2021-Inventory_from_xlsx.py

@@ -5,24 +5,17 @@ import os
 import sys
 import sys
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 
 
 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from config_KOR_BUR4 import remove_cats, aggregate_before_mapping, cat_mapping, \
 from config_KOR_BUR4 import remove_cats, aggregate_before_mapping, cat_mapping, \
     aggregate_after_mapping, coords_terminologies_2006, filter_remove_2006, \
     aggregate_after_mapping, coords_terminologies_2006, filter_remove_2006, \
     filter_remove_after_agg
     filter_remove_after_agg
-
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import filter_data, matches_time_format
 from primap2.pm2io._data_reading import filter_data, matches_time_format
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
                '2021-Inventory'
                '2021-Inventory'
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Republic_of_Korea'
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Republic_of_Korea'

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/Republic_of_Korea/read_KOR_BUR4_from_xlsx.py

@@ -5,19 +5,14 @@ import os
 import sys
 import sys
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 
 
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from config_KOR_BUR4 import cat_name_translations, cat_codes
 from primap2.pm2io._data_reading import filter_data
 from primap2.pm2io._data_reading import filter_data
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Republic_of_Korea' / \
                '2020-Inventory'
                '2020-Inventory'
 output_folder = extracted_data_path / 'UNFCCC' / 'Republic_of_Korea'
 output_folder = extracted_data_path / 'UNFCCC' / 'Republic_of_Korea'

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Taiwan/read_TWN_2022-Inventory_from_pdf.py

@@ -3,11 +3,10 @@
 
 
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import camelot
 import copy
 import copy
-#import re
 
 
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
 
 
 from config_TWN_NIR2022 import table_defs, page_defs
 from config_TWN_NIR2022 import table_defs, page_defs
@@ -17,12 +16,6 @@ from config_TWN_NIR2022 import gwp_to_use
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Taiwan'
 input_folder = downloaded_data_path / 'non-UNFCCC' / 'Taiwan'
 # TODO: move file to subfolder
 # TODO: move file to subfolder
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Taiwan'
 output_folder = extracted_data_path / 'non-UNFCCC' / 'Taiwan'

+ 1 - 8
UNFCCC_GHG_data/UNFCCC_reader/Thailand/read_THA_BUR3_from_pdf.py

@@ -1,23 +1,16 @@
 # this script reads data from Thailand's BUR3
 # this script reads data from Thailand's BUR3
 # Data is read from the pdf file
 # Data is read from the pdf file
-
 import pandas as pd
 import pandas as pd
 import primap2 as pm2
 import primap2 as pm2
-from pathlib import Path
 import camelot
 import camelot
 import copy
 import copy
 
 
+from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path
 from primap2.pm2io._data_reading import matches_time_format
 from primap2.pm2io._data_reading import matches_time_format
 
 
 # ###
 # ###
 # configuration
 # configuration
 # ###
 # ###
-root_path = Path(__file__).parents[3].absolute()
-root_path = root_path.resolve()
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-
-
 input_folder = downloaded_data_path / 'UNFCCC' / 'Thailand' / 'BUR3'
 input_folder = downloaded_data_path / 'UNFCCC' / 'Thailand' / 'BUR3'
 output_folder = extracted_data_path / 'UNFCCC' / 'Thailand'
 output_folder = extracted_data_path / 'UNFCCC' / 'Thailand'
 if not output_folder.exists():
 if not output_folder.exists():

+ 1 - 6
UNFCCC_GHG_data/UNFCCC_reader/__init__.py

@@ -1,6 +1 @@
-# expose some of the functions to the outside as they are used in other readers as well
-# TODO: create a unified util module for all readers
-
-from .get_submissions_info import get_country_code
-
-__all__ = ["get_country_code"]
+#

+ 3 - 437
UNFCCC_GHG_data/UNFCCC_reader/get_submissions_info.py

@@ -5,324 +5,11 @@ from typing import List, Dict
 from pathlib import Path
 from pathlib import Path
 import json
 import json
 import pycountry
 import pycountry
-#import os
 
 
-root_path = Path(__file__).parents[2].absolute()
-root_path = root_path.resolve()
-code_path = root_path / "UNFCCC_GHG_data" / "UNFCCC_reader"
-# beware, folders below are different than for CRF reader
-downloaded_data_path = root_path / "downloaded_data"
-extracted_data_path = root_path / "extracted_data"
-legacy_data_path = root_path / "legacy_data"
-
-# TODO: move this to general util package
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom of Great Britain and Northern Ireland",
-}
-
-custom_folders = {
-    'Venezeula_(Bolivarian_Republic_of)': 'VEN',
-    'Venezuela_(Bolivarian_Republic_of)': 'VEN',
-    'Micronesia_(Federated_State_of)': 'FSM',
-    'Micronesia_(Federated_States_of)': 'FSM',
-    'The_Republic_of_North_Macedonia': 'MKD',
-    'Republic_of_Korea': 'KOR',
-    'Bolivia_(Plurinational_State_of)': 'BOL',
-    'Türkiye': 'TUR',
-    'Iran_(Islamic_Republic_of)': 'IRN',
-    'Côte_d’Ivoire': 'CIV',
-    'Democratic_Republic_of_the_Congo': "COD",
-    'European_Union': 'EUA',
-    'Taiwan': 'TWN',
-}
-
-def get_country_submissions(
-        country_name: str,
-        print_sub: bool = True,
-) -> Dict[str, List[str]]:
-    """
-    Input is a three letter ISO UNFCCC_GHG_data for a country, or the countries name.
-    The function tries to map the country name to an ISO UNFCCC_GHG_data and then
-    queries the folder mapping files for folders.
-
-    Parameters
-    ----------
-        country_name: str
-            String containing the country name or ISO 3 letter UNFCCC_GHG_data
-
-        print_sub: bool
-            If True information on submissions will be written to stdout
-
-    Returns
-    -------
-        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
-        Each value is a list of folders
-
-    """
-
-    data_folder = downloaded_data_path
-
-    country_code = get_country_code(country_name)
-
-    if print_sub:
-        print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
-
-    country_submissions = {}
-    if print_sub:
-        print(f"#" * 80)
-        print(f"The following submissions are available for {country_name}")
-    for item in data_folder.iterdir():
-        if item.is_dir():
-            if print_sub:
-                print("")
-                print("-" * 80)
-                print(f"Data folder {item.name}")
-                print("-" * 80)
-            with open(item / "folder_mapping.json", "r") as mapping_file:
-                folder_mapping = json.load(mapping_file)
-            if country_code in folder_mapping:
-                country_folders = folder_mapping[country_code]
-                if isinstance(country_folders, str):
-                    # only one folder
-                    country_folders = [country_folders]
-
-                submission_folders = []
-                for country_folder in country_folders:
-                    current_folder = item / country_folder
-                    if print_sub:
-                        print(f"Submissions in folder {country_folder}:")
-
-                    for submission_folder in current_folder.iterdir():
-                        if submission_folder.is_dir():
-                            if print_sub:
-                                print(submission_folder.name)
-                            submission_folders.append(submission_folder.name)
-
-                country_submissions[item.name] = submission_folders
-            else:
-                print(f"No submissions available for {country_name}.")
-
-    return country_submissions
-
-
-def get_country_datasets(
-        country_name: str,
-        print_ds: bool = True,
-) -> Dict[str, List[str]]:
-    """
-    Input is a three letter ISO UNFCCC_GHG_data for a country, or the country's name.
-    The function tries to map the country name to an ISO UNFCCC_GHG_data and then
-    checks the UNFCCC_GHG_data and data folders for content on the country.
-
-    Parameters
-    ----------
-        country_name: str
-            String containing the country name or ISO 3 letter UNFCCC_GHG_data
-
-        print_ds: bool
-            If True information on submissions will be written to stdout
-
-    Returns
-    -------
-        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
-        Each value is a list of folders
-
-    """
-
-    data_folder = extracted_data_path
-    data_folder_legacy = legacy_data_path
-
-
-    # obtain country UNFCCC_GHG_data
-    country_code = get_country_code(country_name)
-
-    if print_ds:
-        print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
+from UNFCCC_GHG_data.helper import root_path, downloaded_data_path, extracted_data_path
+from UNFCCC_GHG_data.helper import get_country_code
 
 
-    rep_data = {}
-    # data
-    if print_ds:
-        print(f"#" * 80)
-        print(f"The following datasets are available for {country_name}")
-    for item in data_folder.iterdir():
-        if item.is_dir():
-            cleaned_datasets_current_folder = {}
-            if print_ds:
-                print("-" * 80)
-                print(f"Data folder {item.name}")
-                print("-" * 80)
-            with open(item / "folder_mapping.json", "r") as mapping_file:
-                folder_mapping = json.load(mapping_file)
-            if country_code not in folder_mapping:
-                if print_ds:
-                    print("No data available")
-                    print("")
-            else:
-                country_folder = folder_mapping[country_code]
-                if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
-
-                datasets_current_folder = {}
-                current_folder = item / country_folder
-
-                for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
-                        if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
-                        else:
-                            datasets_current_folder[data_file.stem] = [data_file.suffix]
-
-                for dataset in datasets_current_folder:
-                    # process filename to get submission
-                    parts = dataset.split('_')
-                    if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
-                    else:
-                        terminology = "_".join(parts[3 : ])
-                        key = f"{parts[1]} ({parts[2]}, {terminology})"
-                        data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
-                            data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
-                            data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF (.yaml), "
-
-                        code_file = get_code_file(country_code, parts[1])
-                        if code_file:
-                            data_info = data_info + f"UNFCCC_GHG_data: {code_file.name}"
-                        else:
-                            data_info = data_info + f"UNFCCC_GHG_data: not found"
-
-                        cleaned_datasets_current_folder[key] = data_info
-
-                if print_ds:
-                    if cleaned_datasets_current_folder:
-                        for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
-                    else:
-                        print("No data available")
-                    print("")
-
-            rep_data[item.name] = cleaned_datasets_current_folder
-
-    # legacy data
-    if print_ds:
-        print(f"#" * 80)
-        print(f"The following legacy datasets are available for {country_name}")
-    legacy_data = {}
-    for item in data_folder_legacy.iterdir():
-        if item.is_dir():
-            cleaned_datasets_current_folder = {}
-            if print_ds:
-                print("-" * 80)
-                print(f"Data folder {item.name}")
-                print("-" * 80)
-            with open(item / "folder_mapping.json", "r") as mapping_file:
-                folder_mapping = json.load(mapping_file)
-            if country_code not in folder_mapping:
-                if print_ds:
-                    print("No data available")
-                    print("")
-            else:
-                country_folder = folder_mapping[country_code]
-                if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
-
-                datasets_current_folder = {}
-                current_folder = item / country_folder
-
-                for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
-                        if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
-                        else:
-                            datasets_current_folder[data_file.stem] = [data_file.suffix]
-
-                for dataset in datasets_current_folder:
-                    # process filename to get submission
-                    parts = dataset.split('_')
-                    if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
-                    else:
-                        terminology = "_".join(parts[3 : ])
-                        key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
-                        data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
-                            data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
-                            data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
-                            data_info = data_info + "incomplete IF (.yaml), "
-
-                        cleaned_datasets_current_folder[key] = data_info
-
-                if print_ds:
-                    if cleaned_datasets_current_folder:
-                        for country_ds in cleaned_datasets_current_folder:
-                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
-                    else:
-                        print("No data available")
-                    print("")
-
-                legacy_data[item.name] = cleaned_datasets_current_folder
-
-    all_data = {
-        "rep_data": rep_data,
-        "legacy_data": legacy_data,
-    }
-
-    return all_data
-
-
-def get_country_code(
-        country_name: str,
-)->str:
-    """
-    obtain country UNFCCC_GHG_data. If the input is a UNFCCC_GHG_data it will be returned, if the input
-    is not a three letter UNFCCC_GHG_data a search will be performed
-
-    Parameters
-    __________
-    country_name: str
-        Country UNFCCC_GHG_data or name to get the three-letter UNFCCC_GHG_data for.
-
-    """
-    # First check if it's in the list of custom codes
-    if country_name in custom_country_mapping:
-        country_code = country_name
-    else:
-        try:
-            # check if it's a 3 letter UNFCCC_GHG_data
-            country = pycountry.countries.get(alpha_3=country_name)
-            country_code = country.alpha_3
-        except:
-            try:
-                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
-            except:
-                raise ValueError(f"Country name {country_name} can not be mapped to "
-                                 f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
-            if len(country) > 1:
-                country_code = None
-                for current_country in country:
-                    if current_country.name == country_name:
-                        country_code = current_country.alpha_3
-                if country_code is None:
-                    raise ValueError(f"Country name {country_name} has {len(country)} "
-                                     f"possible results for country codes.")
-
-            country_code = country[0].alpha_3
-
-    return country_code
+code_path = root_path / "UNFCCC_GHG_data" / "UNFCCC_reader"
 
 
 
 
 def get_possible_inputs(
 def get_possible_inputs(
@@ -446,128 +133,7 @@ def get_possible_outputs(
     return output_files
     return output_files
 
 
 
 
-def get_code_file(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
-) -> Path:
-    """
-    For given country name and submission find the script that creates the data
 
 
-    Parameters
-    ----------
-        country_name: str
-            String containing the country name or ISO 3 letter UNFCCC_GHG_data
-
-        submission: str
-            String of the submission
-
-        print_info: bool = False
-            If True print information on UNFCCC_GHG_data found
-
-    Returns
-    -------
-        returns a pathlib Path object for the UNFCCC_GHG_data file
-    """
-
-    code_file_path = None
-
-    # CRF is an exception as it's read using the UNFCCC_CRF_reader module
-    # so we return the path to that.
-    if submission[0:3] == "CRF":
-        return root_path / "UNFCCC_CRF_reader"
 
 
-    # obtain country UNFCCC_GHG_data
-    country_code = get_country_code(country_name)
-
-    if print_info:
-        print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
-
-    with open(code_path / "folder_mapping.json", "r") as mapping_file:
-        folder_mapping = json.load(mapping_file)
-
-    if country_code not in folder_mapping:
-        if print_info:
-            print("No UNFCCC_GHG_data available")
-            print("")
-    else:
-        country_folder = code_path / folder_mapping[country_code]
-        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
-
-        for file in country_folder.iterdir():
-            if file.match(code_file_name_candidate):
-                if code_file_path is not None:
-                    raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
-                                     f"{code_file_path} and file.name. "
-                                     f"Please use only one file with name "
-                                     f"'read_ISO3_submission_XXX.YYY'.")
-                else:
-                    if print_info:
-                        print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
-                code_file_path = file
-
-    if code_file_path is not None:
-        return code_file_path.relative_to(root_path)
-    else:
-        return None
-
-
-def create_folder_mapping(
-        folder: str,
-        extracted: bool = False
-) -> None:
-    """
-    Create a mapping from 3 letter ISO country codes to folders
-    based on the subfolders of the given folder. The mapping is
-    stored in 'folder_mapping.json' in the given folder. Folder
-    must be given relative to the repository root
 
 
-    Parameters
-    ----------
-        folder: str
-            folder to create the mapping for
-        extracted: bool = False
-            If true treat the folder as extracted data, where we
-            only have one folder per country and no typos in the
-            names
-
-    Returns
-    -------
-        Nothing
-
-    """
 
 
-    folder = root_path / folder
-    folder_mapping = {}
-    #if not extracted:
-    known_folders = custom_folders
-    #else:
-    #    known_folders = {}
-
-    for item in folder.iterdir():
-        if item.is_dir() and not item.match("__pycache__"):
-            if item.name in known_folders:
-                ISO3 = known_folders[item.name]
-            else:
-                try:
-                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
-                    if len(country) > 1:
-                        ISO3 = None
-                        for current_country in country:
-                            if current_country.name == item.name.replace("_", " "):
-                                ISO3 = current_country.alpha_3
-                    else:
-                        ISO3 = country[0].alpha_3
-                except:
-                    ISO3 = None
-
-            if ISO3 is None:
-                print(f"No match for {item.name}")
-            else:
-                if ISO3 in folder_mapping.keys():
-                    folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
-                else:
-                    folder_mapping[ISO3] = item.name
-
-    with open(folder / "folder_mapping.json", "w") as mapping_file:
-        json.dump(folder_mapping, mapping_file, indent=4)

+ 3 - 9
UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py

@@ -1,15 +1,12 @@
 # this script takes submission and country as input (from doit) and
 # this script takes submission and country as input (from doit) and
 # runs the appropriate script to extract the submission data
 # runs the appropriate script to extract the submission data
 
 
-import sys
 import datalad.api
 import datalad.api
-from pathlib import Path
 import argparse
 import argparse
 from get_submissions_info import get_code_file
 from get_submissions_info import get_code_file
 from get_submissions_info import get_possible_inputs
 from get_submissions_info import get_possible_inputs
 from get_submissions_info import get_possible_outputs
 from get_submissions_info import get_possible_outputs
-
-
+from UNFCCC_GHG_data.helper import root_path
 
 
 # Find the right function and possible input and output files and
 # Find the right function and possible input and output files and
 # read the data using datalad run.
 # read the data using datalad run.
@@ -22,9 +19,6 @@ args = parser.parse_args()
 country = args.country
 country = args.country
 submission = args.submission
 submission = args.submission
 
 
-codepath = Path(__file__).parent
-rootpath = codepath / ".." / ".."
-rootpath = rootpath.resolve()
 
 
 print(f"Attempting to extract data for {submission} from {country}.")
 print(f"Attempting to extract data for {submission} from {country}.")
 print("#"*80)
 print("#"*80)
@@ -49,7 +43,7 @@ if script_name is not None:
         print("")
         print("")
     # make input files absolute to avoid datalad confusions when
     # make input files absolute to avoid datalad confusions when
     # root directory is via symlink
     # root directory is via symlink
-    input_files = [rootpath / file for file in input_files]
+    input_files = [root_path / file for file in input_files]
     # convert file's path to str
     # convert file's path to str
     input_files = [file.as_posix() for file in input_files]
     input_files = [file.as_posix() for file in input_files]
 
 
@@ -69,7 +63,7 @@ if script_name is not None:
     print(f"Run the script using datalad run via the python api")
     print(f"Run the script using datalad run via the python api")
     datalad.api.run(
     datalad.api.run(
         cmd=f"./venv/bin/python3 {script_name.as_posix()}",
         cmd=f"./venv/bin/python3 {script_name.as_posix()}",
-        dataset=rootpath,
+        dataset=root_path,
         message=f"Read data for {country}, {submission}.",
         message=f"Read data for {country}, {submission}.",
         inputs=input_files,
         inputs=input_files,
         outputs=output_files,
         outputs=output_files,

+ 6 - 1
UNFCCC_GHG_data/__init__.py

@@ -2,7 +2,12 @@
 
 
 from . import UNFCCC_reader
 from . import UNFCCC_reader
 from . import UNFCCC_CRF_reader
 from . import UNFCCC_CRF_reader
+from . import helper
 # import UNFCCC_DI_reader
 # import UNFCCC_DI_reader
 # import UNFCCC_downloader
 # import UNFCCC_downloader
 
 
-__all__ = ["UNFCCC_reader", "UNFCCC_CRF_reader"]
+__all__ = [
+    "UNFCCC_reader",
+    "UNFCCC_CRF_reader",
+    "helper",
+]

+ 24 - 0
UNFCCC_GHG_data/helper/__init__.py

@@ -0,0 +1,24 @@
+from .definitions import root_path, code_path, log_path
+from .definitions import extracted_data_path, extracted_data_path_UNFCCC
+from .definitions import legacy_data_path
+from .definitions import downloaded_data_path, downloaded_data_path_UNFCCC
+from .definitions import dataset_path, dataset_path_UNFCCC
+from .definitions import custom_country_mapping, custom_folders
+from .functions import get_country_code, get_country_name
+
+__all__ = [
+    "root_path",
+    "code_path",
+    "log_path",
+    "extracted_data_path",
+    "extracted_data_path_UNFCCC",
+    "legacy_data_path",
+    "downloaded_data_path",
+    "downloaded_data_path_UNFCCC",
+    "dataset_path",
+    "dataset_path_UNFCCC",
+    "custom_country_mapping",
+    "custom_folders",
+    "get_country_code",
+    "get_country_name",
+]

+ 2 - 2
UNFCCC_GHG_data/UNFCCC_reader/country_info.py → UNFCCC_GHG_data/helper/country_info.py

@@ -2,8 +2,8 @@
 # runs displays available submissions and datasets
 # runs displays available submissions and datasets
 
 
 import argparse
 import argparse
-from get_submissions_info import get_country_submissions
-from get_submissions_info import get_country_datasets
+from UNFCCC_GHG_data.helper.functions import get_country_submissions
+from UNFCCC_GHG_data.helper.functions import get_country_datasets
 
 
 # Find the right function and possible input and output files and
 # Find the right function and possible input and output files and
 # read the data using datalad run.
 # read the data using datalad run.

+ 49 - 0
UNFCCC_GHG_data/helper/definitions.py

@@ -0,0 +1,49 @@
+import os
+from pathlib import Path
+
+
def get_root_path() -> Path:
    """Return the repository root directory.

    The location is taken from the ``UNFCCC_GHG_ROOT_PATH`` environment
    variable and resolved to an absolute path.

    Raises
    ------
        ValueError if the environment variable is not set
    """
    configured = os.getenv('UNFCCC_GHG_ROOT_PATH')
    if configured is not None:
        return Path(configured).resolve()
    raise ValueError('UNFCCC_GHG_ROOT_PATH environment variable needs to be set')
+
# The root path is resolved once at import time from the UNFCCC_GHG_ROOT_PATH
# environment variable; every other location below is derived from it.
root_path = get_root_path()
code_path = root_path / "UNFCCC_GHG_data"
log_path = root_path / "log"
extracted_data_path = root_path / "extracted_data"
extracted_data_path_UNFCCC = extracted_data_path / "UNFCCC"
downloaded_data_path = root_path / "downloaded_data"
downloaded_data_path_UNFCCC = downloaded_data_path / "UNFCCC"
legacy_data_path = root_path / "legacy_data"
dataset_path = root_path / "datasets"
dataset_path_UNFCCC = dataset_path / "UNFCCC"


# Non-standard three-letter codes used in the UNFCCC data, mapped to the
# country name they stand for (several codes can map to the same country,
# e.g. EUA and EUC both denote the European Union).
custom_country_mapping = {
    "EUA": "European Union",
    "EUC": "European Union",
    "FRK": "France",
    "DKE": "Denmark",
    "DNM": "Denmark",
    "GBK": "United Kingdom of Great Britain and Northern Ireland",
}

# Downloaded-data folder names that fuzzy country-name matching cannot
# resolve, mapped to their ISO3 code. The keys intentionally include
# misspellings present in existing folder names (e.g. 'Venezeula',
# 'Federated_State_of') — do not "fix" them here or the lookup breaks.
custom_folders = {
    'Venezeula_(Bolivarian_Republic_of)': 'VEN',
    'Venezuela_(Bolivarian_Republic_of)': 'VEN',
    'Micronesia_(Federated_State_of)': 'FSM',
    'Micronesia_(Federated_States_of)': 'FSM',
    'The_Republic_of_North_Macedonia': 'MKD',
    'Republic_of_Korea': 'KOR',
    'Bolivia_(Plurinational_State_of)': 'BOL',
    'Türkiye': 'TUR',
    'Iran_(Islamic_Republic_of)': 'IRN',
    'Côte_d’Ivoire': 'CIV',
    'Democratic_Republic_of_the_Congo': "COD",
    'European_Union': 'EUA',
    'Taiwan': 'TWN',
}

+ 1 - 1
UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py → UNFCCC_GHG_data/helper/folder_mapping.py

@@ -3,7 +3,7 @@
 # for that folder
 # for that folder
 
 
 import argparse
 import argparse
-from get_submissions_info import create_folder_mapping
+from .functions import create_folder_mapping
 
 
 # Find the right function and possible input and output files and
 # Find the right function and possible input and output files and
 # read the data using datalad run.
 # read the data using datalad run.

+ 445 - 0
UNFCCC_GHG_data/helper/functions.py

@@ -0,0 +1,445 @@
+import pycountry
+import json
+from typing import Dict, List
+from pathlib import Path
+from .definitions import custom_country_mapping, custom_folders
+from .definitions import root_path, downloaded_data_path, extracted_data_path
+from .definitions import legacy_data_path, code_path
+
+
def get_country_name(
        country_code: str,
) -> str:
    """
    Get the country name for a three-letter country code.

    Custom codes (e.g. EUA for the European Union) are resolved via
    `custom_country_mapping` first, everything else is looked up in pycountry.

    Parameters
    ----------
        country_code: str
            three-letter country code

    Returns
    -------
        country name: str

    Raises
    ------
        ValueError if the code can not be mapped to a country
    """
    if country_code in custom_country_mapping:
        return custom_country_mapping[country_code]

    # pycountry returns None for unknown codes instead of raising; the old
    # bare `except:` only worked because of the resulting AttributeError
    country = pycountry.countries.get(alpha_3=country_code)
    if country is None:
        raise ValueError(f"Country code {country_code} can not be mapped to "
                         f"any country")
    return country.name
+
+
def get_country_code(
        country_name: str,
) -> str:
    """
    Obtain a three-letter country code. If the input already is a code it is
    returned unchanged; otherwise a fuzzy name search is performed.

    Parameters
    __________
    country_name: str
        Country code or name to get the three-letter code for.

    Returns
    -------
        country_code: str

    Raises
    ------
        ValueError if no code or several ambiguous codes are found
    """
    # First check if it's in the list of custom codes
    if country_name in custom_country_mapping:
        country_code = country_name
    else:
        # check if it's a 3 letter code; pycountry returns None on no match
        country = pycountry.countries.get(alpha_3=country_name)
        if country is not None:
            country_code = country.alpha_3
        else:
            try:
                country = pycountry.countries.search_fuzzy(
                    country_name.replace("_", " "))
            except LookupError:
                raise ValueError(f"Country name {country_name} can not be mapped to "
                                 f"any country code. Try using the ISO3 code directly.")
            if len(country) > 1:
                country_code = None
                # prefer an exact name match among the fuzzy results
                for current_country in country:
                    if current_country.name == country_name:
                        country_code = current_country.alpha_3
                if country_code is None:
                    raise ValueError(f"Country name {country_name} has {len(country)} "
                                     f"possible results for country codes.")
                # NOTE: previously the exact match was discarded here by an
                # unconditional `country_code = country[0].alpha_3`
            else:
                country_code = country[0].alpha_3

    return country_code
+
+
def create_folder_mapping(
        folder: str,
        extracted: bool = False
) -> None:
    """
    Create a mapping from 3 letter ISO country codes to folders
    based on the subfolders of the given folder. The mapping is
    stored in 'folder_mapping.json' in the given folder. Folder
    must be given relative to the repository root

    Parameters
    ----------
        folder: str
            folder to create the mapping for
        extracted: bool = False
            If true treat the folder as extracted data, where we
            only have one folder per country and no typos in the
            names

    Returns
    -------
        Nothing

    """

    folder = root_path / folder
    folder_mapping = {}
    # the custom folder names are always consulted; for extracted data they
    # simply never match (TODO: use an empty dict when `extracted` is True)
    known_folders = custom_folders

    for item in folder.iterdir():
        if item.is_dir() and not item.match("__pycache__"):
            if item.name in known_folders:
                ISO3 = known_folders[item.name]
            else:
                try:
                    country = pycountry.countries.search_fuzzy(
                        item.name.replace("_", " "))
                    if len(country) > 1:
                        # only accept an exact name match when ambiguous
                        ISO3 = None
                        for current_country in country:
                            if current_country.name == item.name.replace("_", " "):
                                ISO3 = current_country.alpha_3
                    else:
                        ISO3 = country[0].alpha_3
                except LookupError:
                    # no fuzzy match at all
                    ISO3 = None

            if ISO3 is None:
                print(f"No match for {item.name}")
            elif ISO3 in folder_mapping:
                # a country can own several folders (e.g. name variants);
                # previously a third folder produced a nested list
                existing = folder_mapping[ISO3]
                if isinstance(existing, list):
                    existing.append(item.name)
                else:
                    folder_mapping[ISO3] = [existing, item.name]
            else:
                folder_mapping[ISO3] = item.name

    with open(folder / "folder_mapping.json", "w") as mapping_file:
        json.dump(folder_mapping, mapping_file, indent=4)
+
+
# TODO add crf
def get_country_submissions(
        country_name: str,
        print_sub: bool = True,
) -> Dict[str, List[str]]:
    """
    Input is a three letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    queries the folder mapping files for folders.

    Parameters
    ----------
        country_name: str
            String containing the country name or ISO 3 letter code

        print_sub: bool
            If True information on submissions will be written to stdout

    Returns
    -------
        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
        Each value is a list of folders

    """

    data_folder = downloaded_data_path

    country_code = get_country_code(country_name)

    if print_sub:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    country_submissions = {}
    if print_sub:
        print("#" * 80)
        print(f"The following submissions are available for {country_name}")
    for item in data_folder.iterdir():
        if not item.is_dir():
            continue
        if print_sub:
            print("")
            print("-" * 80)
            print(f"Data folder {item.name}")
            print("-" * 80)
        with open(item / "folder_mapping.json", "r") as mapping_file:
            folder_mapping = json.load(mapping_file)
        if country_code in folder_mapping:
            country_folders = folder_mapping[country_code]
            if isinstance(country_folders, str):
                # only one folder
                country_folders = [country_folders]

            submission_folders = []
            for country_folder in country_folders:
                current_folder = item / country_folder
                if print_sub:
                    print(f"Submissions in folder {country_folder}:")

                for submission_folder in current_folder.iterdir():
                    if submission_folder.is_dir():
                        if print_sub:
                            print(submission_folder.name)
                        submission_folders.append(submission_folder.name)

            country_submissions[item.name] = submission_folders
        elif print_sub:
            # was printed unconditionally before, ignoring print_sub
            print(f"No submissions available for {country_name}.")

    return country_submissions
+
+
def _scan_dataset_files(current_folder: Path) -> Dict[str, List[str]]:
    """Map dataset file stems in *current_folder* to the list of present
    suffixes (only '.nc', '.yaml' and '.csv' files are considered)."""
    datasets_current_folder = {}
    for data_file in current_folder.iterdir():
        if data_file.suffix in ['.nc', '.yaml', '.csv']:
            datasets_current_folder.setdefault(
                data_file.stem, []).append(data_file.suffix)
    return datasets_current_folder


def _describe_formats(suffixes: List[str]) -> str:
    """Build the human readable format info string for one dataset from the
    list of file suffixes found for it."""
    data_info = ""
    if '.nc' in suffixes:
        data_info = data_info + "NF (.nc), "
    # a complete interchange format needs both the .yaml metadata and the .csv
    if ('.csv' in suffixes) and ('.yaml' in suffixes):
        data_info = data_info + "IF (.yaml + .csv), "
    elif '.csv' in suffixes:
        data_info = data_info + "incomplete IF? (.csv), "
    elif '.yaml' in suffixes:
        data_info = data_info + "incomplete IF (.yaml), "
    return data_info


def get_country_datasets(
        country_name: str,
        print_ds: bool = True,
) -> Dict[str, List[str]]:
    """
    List the extracted datasets (current and legacy) available for a country.

    Input is a three letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    checks the extracted data and legacy data folders for content on the
    country.

    Parameters
    ----------
        country_name: str
            String containing the country name or ISO 3 letter code

        print_ds: bool
            If True information on datasets will be written to stdout

    Returns
    -------
        returns a dict with keys "rep_data" and "legacy_data". Each value is
        a dict mapping data folder names to the datasets found there.

    """

    data_folder = extracted_data_path
    data_folder_legacy = legacy_data_path

    # obtain country code
    country_code = get_country_code(country_name)

    if print_ds:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    rep_data = {}
    # data
    if print_ds:
        print("#" * 80)
        print(f"The following datasets are available for {country_name}")
    for item in data_folder.iterdir():
        if not item.is_dir():
            continue
        cleaned_datasets_current_folder = {}
        if print_ds:
            print("-" * 80)
            print(f"Data folder {item.name}")
            print("-" * 80)
        with open(item / "folder_mapping.json", "r") as mapping_file:
            folder_mapping = json.load(mapping_file)
        if country_code not in folder_mapping:
            if print_ds:
                print("No data available")
                print("")
        else:
            country_folder = folder_mapping[country_code]
            if not isinstance(country_folder, str):
                raise ValueError("Wrong data type in folder mapping json file. Should be str.")

            datasets_current_folder = _scan_dataset_files(item / country_folder)

            for dataset, suffixes in datasets_current_folder.items():
                # process filename to get submission
                parts = dataset.split('_')
                if parts[0] != country_code:
                    cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = \
                        dataset
                else:
                    terminology = "_".join(parts[3:])
                    key = f"{parts[1]} ({parts[2]}, {terminology})"
                    data_info = _describe_formats(suffixes)

                    code_file = get_code_file(country_code, parts[1])
                    if code_file:
                        data_info = data_info + f"code: {code_file.name}"
                    else:
                        data_info = data_info + "code: not found"

                    cleaned_datasets_current_folder[key] = data_info

            if print_ds:
                if cleaned_datasets_current_folder:
                    for country_ds in cleaned_datasets_current_folder:
                        print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
                else:
                    print("No data available")
                print("")

        rep_data[item.name] = cleaned_datasets_current_folder

    # legacy data
    if print_ds:
        print("#" * 80)
        print(f"The following legacy datasets are available for {country_name}")
    legacy_data = {}
    for item in data_folder_legacy.iterdir():
        if not item.is_dir():
            continue
        cleaned_datasets_current_folder = {}
        if print_ds:
            print("-" * 80)
            print(f"Data folder {item.name}")
            print("-" * 80)
        with open(item / "folder_mapping.json", "r") as mapping_file:
            folder_mapping = json.load(mapping_file)
        if country_code not in folder_mapping:
            if print_ds:
                print("No data available")
                print("")
        else:
            country_folder = folder_mapping[country_code]
            if not isinstance(country_folder, str):
                raise ValueError("Wrong data type in folder mapping json file. Should be str.")

            datasets_current_folder = _scan_dataset_files(item / country_folder)

            for dataset, suffixes in datasets_current_folder.items():
                # process filename to get submission
                parts = dataset.split('_')
                if parts[0] != country_code:
                    # message kept consistent with the non-legacy loop above
                    cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
                else:
                    terminology = "_".join(parts[3:])
                    key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                    # no code file lookup for legacy data: it was not produced
                    # by scripts in this repository
                    cleaned_datasets_current_folder[key] = _describe_formats(suffixes)

            if print_ds:
                if cleaned_datasets_current_folder:
                    for country_ds in cleaned_datasets_current_folder:
                        print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
                else:
                    print("No data available")
                print("")

        # record the folder unconditionally (empty dict when nothing found),
        # consistent with rep_data above
        legacy_data[item.name] = cleaned_datasets_current_folder

    all_data = {
        "rep_data": rep_data,
        "legacy_data": legacy_data,
    }

    return all_data
+
+
def get_code_file(
        country_name: str,
        submission: str,
        print_info: bool = False,
) -> Path:
    """
    For given country name and submission find the script that creates the data

    Parameters
    ----------
        country_name: str
            String containing the country name or ISO 3 letter code

        submission: str
            String of the submission

        print_info: bool = False
            If True print information on the code file found

    Returns
    -------
        returns a pathlib Path object for the code file, relative to the
        repository root, or None if no code file was found

    Raises
    ------
        ValueError
            If more than one reader script matches the submission (ambiguous)
    """

    code_file_path = None
    UNFCCC_reader_path = code_path / "UNFCCC_reader"

    # CRF is an exception as it's read using the UNFCCC_CRF_reader module
    # so we return the path to that.
    if submission[0:3] == "CRF":
        return root_path / "UNFCCC_CRF_reader"

    # DI data is read using the UNFCCC_DI_reader module
    if submission[0:2] == "DI":
        return root_path / "UNFCCC_DI_reader"

    # obtain country code
    country_code = get_country_code(country_name)

    if print_info:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    with open(UNFCCC_reader_path / "folder_mapping.json", "r") as mapping_file:
        folder_mapping = json.load(mapping_file)

    if country_code not in folder_mapping:
        if print_info:
            print("No code available")
            print("")
    else:
        country_folder = UNFCCC_reader_path / folder_mapping[country_code]
        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"

        for file in country_folder.iterdir():
            if file.match(code_file_name_candidate):
                if code_file_path is not None:
                    # more than one reader script matches: ambiguous, so fail
                    # loudly instead of silently picking one
                    raise ValueError(f"Found multiple code file candidates: "
                                     f"{code_file_path} and {file.name}. "
                                     f"Please use only one file with name "
                                     f"'read_ISO3_submission_XXX.YYY'.")
                else:
                    if print_info:
                        print(f"Found code file {file.relative_to(root_path)}")
                code_file_path = file

    if code_file_path is not None:
        return code_file_path.relative_to(root_path)
    else:
        return None

+ 38 - 11
dodo.py

@@ -1,5 +1,6 @@
 # define tasks for UNFCCC data repository
 # define tasks for UNFCCC data repository
 from doit import get_var
 from doit import get_var
+import os
 
 
 # TODO: task for folder mapping
 # TODO: task for folder mapping
 
 
@@ -18,6 +19,18 @@ def task_setup_venv():
         'verbosity': 2,
         'verbosity': 2,
     }
     }
 
 
+# set UNFCCC_GHG_ROOT_PATH environment variable
+def task_set_env():
+    """
+    Set the environment variable for the module so data is stored in the correct folders
+    """
+    def set_root_path():
+        os.environ["UNFCCC_GHG_ROOT_PATH"] = "."
+
+    return {
+        'actions': [set_root_path],
+    }
+
 
 
 # Task to create the mapping files which map folder names to ISO 3-letter country codes
 # Task to create the mapping files which map folder names to ISO 3-letter country codes
 read_config_folder = {
 read_config_folder = {
@@ -29,8 +42,9 @@ def task_map_folders():
     Create or update the folder mapping in the given folder
     Create or update the folder mapping in the given folder
     """
     """
     return {
     return {
-        'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        'actions': [f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder={read_config_folder['folder']}"],
                     f"--folder={read_config_folder['folder']}"],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -44,6 +58,7 @@ def task_update_bur():
         'actions': ['datalad run -m "Fetch BUR submissions" '
         'actions': ['datalad run -m "Fetch BUR submissions" '
                     '-o downloaded_data/UNFCCC/submissions-bur.csv '
                     '-o downloaded_data/UNFCCC/submissions-bur.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py'],
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_bur.py'],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -58,9 +73,10 @@ def task_download_bur():
         'actions': ['datalad run -m "Download BUR submissions" '
         'actions': ['datalad run -m "Download BUR submissions" '
                     '-i downloaded_data/UNFCCC/submissions-bur.csv '
                     '-i downloaded_data/UNFCCC/submissions-bur.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=BUR',
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=BUR',
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     f"--folder=downloaded_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -73,6 +89,7 @@ def task_update_nc():
         'actions': ['datalad run -m "Fetch NC submissions" '
         'actions': ['datalad run -m "Fetch NC submissions" '
                     '-o downloaded_data/UNFCCC/submissions-nc.csv '
                     '-o downloaded_data/UNFCCC/submissions-nc.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py'],
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_nc.py'],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -87,9 +104,10 @@ def task_download_nc():
         'actions': ['datalad run -m "Download NC submissions" '
         'actions': ['datalad run -m "Download NC submissions" '
                     '-i downloaded_data/UNFCCC/submissions-nc.csv '
                     '-i downloaded_data/UNFCCC/submissions-nc.csv '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=NC',
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_non-annexI.py --category=NC',
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     f"--folder=downloaded_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -112,6 +130,7 @@ def task_update_annexi():
                     f"-o downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"-o downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/fetch_submissions_annexI.py "
                     f"--year={update_aI_config['year']}"],
                     f"--year={update_aI_config['year']}"],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -128,9 +147,10 @@ def task_download_annexi():
                     f"-i downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"-i downloaded_data/UNFCCC/submissions-annexI_{update_aI_config['year']}.csv "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py "
                     f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_annexI.py "
                     f"--category={update_aI_config['category']} --year={update_aI_config['year']}",
                     f"--category={update_aI_config['category']} --year={update_aI_config['year']}",
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     f"--folder=downloaded_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -141,9 +161,10 @@ def task_download_ndc():
     return {
     return {
         'actions': ['datalad run -m "Download NDC submissions" '
         'actions': ['datalad run -m "Download NDC submissions" '
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py',
                     './venv/bin/python UNFCCC_GHG_data/UNFCCC_downloader/download_ndc.py',
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=downloaded_data/UNFCCC"
                     f"--folder=downloaded_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -163,9 +184,10 @@ def task_read_unfccc_submission():
     return {
     return {
         'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py "
         'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/read_UNFCCC_submission.py "
                     f"--country={read_config['country']} --submission={read_config['submission']}",
                     f"--country={read_config['country']} --submission={read_config['submission']}",
-                    f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+                    f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                     f"--folder=extracted_data/UNFCCC"
                     f"--folder=extracted_data/UNFCCC"
                     ],
                     ],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -188,13 +210,14 @@ def task_read_unfccc_crf_submission():
         f"--country={read_config_crf['country']} "
         f"--country={read_config_crf['country']} "
         f"--submission_year={read_config_crf['submission_year']} "
         f"--submission_year={read_config_crf['submission_year']} "
         f"--submission_date={read_config_crf['submission_date']} ",
         f"--submission_date={read_config_crf['submission_date']} ",
-        f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
         f"--folder=extracted_data/UNFCCC"
         f"--folder=extracted_data/UNFCCC"
         ]
         ]
     if read_config_crf["re_read"] == "True":
     if read_config_crf["re_read"] == "True":
         actions[0] = actions[0] + " --re_read"
         actions[0] = actions[0] + " --re_read"
     return {
     return {
         'actions': actions,
         'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -205,7 +228,7 @@ def task_read_new_unfccc_crf_for_year():
     data not present yet. Only reads the latest updated submission for each country."""
     data not present yet. Only reads the latest updated submission for each country."""
     actions = [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_CRF_reader/read_new_UNFCCC_CRF_for_year_datalad.py "
     actions = [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_CRF_reader/read_new_UNFCCC_CRF_for_year_datalad.py "
                f"--submission_year={read_config_crf['submission_year']} ",
                f"--submission_year={read_config_crf['submission_year']} ",
-               f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+               f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
                f"--folder=extracted_data/UNFCCC"
                f"--folder=extracted_data/UNFCCC"
                ]
                ]
     # specifying countries is currently disabled due to problems with command line
     # specifying countries is currently disabled due to problems with command line
@@ -217,6 +240,7 @@ def task_read_new_unfccc_crf_for_year():
     return {
     return {
         #'basename': "Read_CRF_year",
         #'basename': "Read_CRF_year",
         'actions': actions,
         'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -235,11 +259,12 @@ def task_read_unfccc_di_for_country():
         f"./venv/bin/python "
         f"./venv/bin/python "
         f"UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_datalad.py "
         f"UNFCCC_GHG_data/UNFCCC_DI_reader/read_UNFCCC_DI_for_country_datalad.py "
         f"--country={read_config_di['country']}",
         f"--country={read_config_di['country']}",
-        f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
         f"--folder=extracted_data/UNFCCC"
         f"--folder=extracted_data/UNFCCC"
         ]
         ]
     return {
     return {
         'actions': actions,
         'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -250,11 +275,12 @@ def task_process_unfccc_di_for_country():
         f"./venv/bin/python "
         f"./venv/bin/python "
         f"UNFCCC_GHG_data/UNFCCC_DI_reader/process_UNFCCC_DI_for_country_datalad.py "
         f"UNFCCC_GHG_data/UNFCCC_DI_reader/process_UNFCCC_DI_for_country_datalad.py "
         f"--country={read_config_di['country']} --date={read_config_di['date']}",
         f"--country={read_config_di['country']} --date={read_config_di['date']}",
-        f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/folder_mapping.py "
+        f"./venv/bin/python UNFCCC_GHG_data/helper/folder_mapping.py "
         f"--folder=extracted_data/UNFCCC"
         f"--folder=extracted_data/UNFCCC"
         ]
         ]
     return {
     return {
         'actions': actions,
         'actions': actions,
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }
@@ -266,8 +292,9 @@ def task_country_info():
     """ Print information on submissions and datasets
     """ Print information on submissions and datasets
     available for given country"""
     available for given country"""
     return {
     return {
-        'actions': [f"./venv/bin/python UNFCCC_GHG_data/UNFCCC_reader/country_info.py "
+        'actions': [f"./venv/bin/python UNFCCC_GHG_data/helper/country_info.py "
                     f"--country={read_config['country']}"],
                     f"--country={read_config['country']}"],
+        'task_dep': ['set_env'],
         'verbosity': 2,
         'verbosity': 2,
         'setup': ['setup_venv'],
         'setup': ['setup_venv'],
     }
     }