hai 3 anos · a3cf519cd0
--- a/code/UNFCCC_reader/folder_mapping.json
+++ b/code/UNFCCC_reader/folder_mapping.json
@@ -0,0 +1,4 @@
 
				+{
			
 
				+    "KOR": "Republic_of_Korea",
			
 
				+    "CHL": "Chile"
			
 
				+}
			
--- a/code/UNFCCC_reader/get_submissions_info.py
+++ b/code/UNFCCC_reader/get_submissions_info.py
@@ -1,11 +1,11 @@
 
				 # helper functions to get information on available submissions
			
 
				 # and data reading functions for a given country
			
 
				 
			
 
				-from typing import Union, List, Dict
			
 
				+from typing import List, Dict
			
 
				 from pathlib import Path
			
 
				 import json
			
 
				 import countrynames
			
 
				-import os
			
 
				+#import os
			
 
				 
			
 
				 
			
 
				 def get_country_submissions(
			
@@ -76,6 +76,387 @@ def get_country_submissions(
 
				     return country_submissions
			
 
				 
			
 
				 
			
 
				+def get_country_datasets(
			
 
				+        country_name: str,
			
 
				+        print_ds: bool = True,
			
 
				+) -> Dict[str, List[str]]:
			
 
				+    """
			
 
				+    Input is a three letter ISO code for a country, or the country's name.
			
 
				+    The function tries to map the country name to an ISO code and then
			
 
				+    checks the code and data folders for content on the country.
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+        country_name: str
			
 
				+            String containing the country name or ISO 3 letter code
			
 
				+
			
 
				+        print_ds: bool
			
 
				+            If True information on submissions will be written to stdout
			
 
				+
			
 
				+    Returns
			
 
				+    -------
			
 
				+        returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
			
 
				+        Each value is a list of folders
			
 
				+
			
 
				+    """
			
 
				+
			
 
				+    codepath = Path(__file__).parent
			
 
				+    #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
			
 
				+    rootpath = codepath / ".." / ".."
			
 
				+    rootpath = rootpath.resolve()
			
 
				+    data_folder = rootpath / "extracted_data"
			
 
				+    data_folder_legacy = rootpath / "legacy_data"
			
 
				+
			
 
				+
			
 
				+    # obtain country code
			
 
				+    country_code = countrynames.to_code_3(country_name)
			
 
				+    if country_code is None:
			
 
				+        raise ValueError(f"Country name {country_name} can not be mapped to "
			
 
				+                         f"any country code")
			
 
				+
			
 
				+    if print_ds:
			
 
				+        print(f"Country name {country_name} maps to ISO code {country_code}")
			
 
				+
			
 
				+    rep_data = {}
			
 
				+    # data
			
 
				+    if print_ds:
			
 
				+        print(f"#" * 80)
			
 
				+        print(f"The following datasets are available for {country_name}")
			
 
				+    for item in data_folder.iterdir():
			
 
				+        if item.is_dir():
			
 
				+            cleaned_datasets_current_folder = {}
			
 
				+            if print_ds:
			
 
				+                print("-" * 80)
			
 
				+                print(f"Data folder {item.name}")
			
 
				+                print("-" * 80)
			
 
				+            with open(item / "folder_mapping.json", "r") as mapping_file:
			
 
				+                folder_mapping = json.load(mapping_file)
			
 
				+            if country_code not in folder_mapping:
			
 
				+                if print_ds:
			
 
				+                    print("No data available")
			
 
				+                    print("")
			
 
				+            else:
			
 
				+                country_folder = folder_mapping[country_code]
			
 
				+                if not isinstance(country_folder, str):
			
 
				+                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
			
 
				+
			
 
				+                datasets_current_folder = {}
			
 
				+                current_folder = item / country_folder
			
 
				+
			
 
				+                for data_file in current_folder.iterdir():
			
 
				+                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
			
 
				+                        if data_file.stem in datasets_current_folder:
			
 
				+                            datasets_current_folder[data_file.stem].append(data_file.suffix)
			
 
				+                        else:
			
 
				+                            datasets_current_folder[data_file.stem] = [data_file.suffix]
			
 
				+
			
 
				+                for dataset in datasets_current_folder:
			
 
				+                    # process filename to get submission
			
 
				+                    parts = dataset.split('_')
			
 
				+                    if parts[0] != country_code:
			
 
				+                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
			
 
				+                    else:
			
 
				+                        terminology = "_".join(parts[3 : ])
			
 
				+                        key = f"{parts[1]} ({parts[2]}, {terminology})"
			
 
				+                        data_info = ""
			
 
				+                        if '.nc' in datasets_current_folder[dataset]:
			
 
				+                            data_info = data_info + "NF (.nc), "
			
 
				+                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
			
 
				+                            data_info = data_info + "IF (.yaml + .csv), "
			
 
				+                        elif '.csv' in datasets_current_folder[dataset]:
			
 
				+                            data_info = data_info + "incomplete IF? (.csv), "
			
 
				+                        elif '.yaml' in datasets_current_folder[dataset]:
			
 
				+                            data_info = data_info + "incomplete IF (.yaml), "
			
 
				+
			
 
				+                        code_file = get_code_file(country_code, parts[1])
			
 
				+                        if code_file:
			
 
				+                            data_info = data_info + f"code: {code_file.name}"
			
 
				+                        else:
			
 
				+                            data_info = data_info + f"code: not found"
			
 
				+
			
 
				+                        cleaned_datasets_current_folder[key] = data_info
			
 
				+
			
 
				+                if print_ds:
			
 
				+                    if cleaned_datasets_current_folder:
			
 
				+                        for country_ds in cleaned_datasets_current_folder:
			
 
				+                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
			
 
				+                    else:
			
 
				+                        print("No data available")
			
 
				+                    print("")
			
 
				+
			
 
				+            rep_data[item.name] = cleaned_datasets_current_folder
			
 
				+
			
 
				+    # legacy data
			
 
				+    if print_ds:
			
 
				+        print(f"#" * 80)
			
 
				+        print(f"The following legacy datasets are available for {country_name}")
			
 
				+    legacy_data = {}
			
 
				+    for item in data_folder_legacy.iterdir():
			
 
				+        if item.is_dir():
			
 
				+            cleaned_datasets_current_folder = {}
			
 
				+            if print_ds:
			
 
				+                print("-" * 80)
			
 
				+                print(f"Data folder {item.name}")
			
 
				+                print("-" * 80)
			
 
				+            with open(item / "folder_mapping.json", "r") as mapping_file:
			
 
				+                folder_mapping = json.load(mapping_file)
			
 
				+            if country_code not in folder_mapping:
			
 
				+                if print_ds:
			
 
				+                    print("No data available")
			
 
				+                    print("")
			
 
				+            else:
			
 
				+                country_folder = folder_mapping[country_code]
			
 
				+                if not isinstance(country_folder, str):
			
 
				+                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
			
 
				+
			
 
				+                datasets_current_folder = {}
			
 
				+                current_folder = item / country_folder
			
 
				+
			
 
				+                for data_file in current_folder.iterdir():
			
 
				+                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
			
 
				+                        if data_file.stem in datasets_current_folder:
			
 
				+                            datasets_current_folder[data_file.stem].append(data_file.suffix)
			
 
				+                        else:
			
 
				+                            datasets_current_folder[data_file.stem] = [data_file.suffix]
			
 
				+
			
 
				+                for dataset in datasets_current_folder:
			
 
				+                    # process filename to get submission
			
 
				+                    parts = dataset.split('_')
			
 
				+                    if parts[0] != country_code:
			
 
				+                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
			
 
				+                    else:
			
 
				+                        terminology = "_".join(parts[3 : ])
			
 
				+                        key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
			
 
				+                        data_info = ""
			
 
				+                        if '.nc' in datasets_current_folder[dataset]:
			
 
				+                            data_info = data_info + "NF (.nc), "
			
 
				+                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
			
 
				+                            data_info = data_info + "IF (.yaml + .csv), "
			
 
				+                        elif '.csv' in datasets_current_folder[dataset]:
			
 
				+                            data_info = data_info + "incomplete IF? (.csv), "
			
 
				+                        elif '.yaml' in datasets_current_folder[dataset]:
			
 
				+                            data_info = data_info + "incomplete IF (.yaml), "
			
 
				+
			
 
				+                        cleaned_datasets_current_folder[key] = data_info
			
 
				+
			
 
				+                if print_ds:
			
 
				+                    if cleaned_datasets_current_folder:
			
 
				+                        for country_ds in cleaned_datasets_current_folder:
			
 
				+                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
			
 
				+                    else:
			
 
				+                        print("No data available")
			
 
				+                    print("")
			
 
				+
			
 
				+                legacy_data[item.name] = cleaned_datasets_current_folder
			
 
				+
			
 
				+    all_data = {
			
 
				+        "rep_data": rep_data,
			
 
				+        "legacy_data": legacy_data,
			
 
				+    }
			
 
				+
			
 
				+    return all_data
			
 
				+
			
 
				+
			
 
				+def get_possible_inputs(
			
 
				+        country_name: str,
			
 
				+        submission: str,
			
 
				+        print_info: bool = False,
			
 
				+) -> List[Path]:
			
 
				+
			
 
				+    """
			
 
				+    For given country name and submission find the possible input files
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+        country_name: str
			
 
				+            String containing the country name or ISO 3 letter code
			
 
				+
			
 
				+        submission: str
			
 
				+            String of the submission
			
 
				+
			
 
				+        print_info: bool = False
			
 
				+            If True print information on code found
			
 
				+
			
 
				+    Returns
			
 
				+    -------
			
 
				+        returns a list pathlib Path objects for the input files
			
 
				+    """
			
 
				+
			
 
				+    codepath = Path(__file__).parent
			
 
				+    #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
			
 
				+    rootpath = codepath / ".." / ".."
			
 
				+    rootpath = rootpath.resolve()
			
 
				+    data_folder = rootpath / "downloaded_data"
			
 
				+
			
 
				+    # obtain country code
			
 
				+    country_code = countrynames.to_code_3(country_name)
			
 
				+    if country_code is None:
			
 
				+        raise ValueError(f"Country name {country_name} can not be mapped to "
			
 
				+                         f"any country code")
			
 
				+
			
 
				+    if print_info:
			
 
				+        print(f"Country name {country_name} maps to ISO code {country_code}")
			
 
				+
			
 
				+    input_files = []
			
 
				+    for item in data_folder.iterdir():
			
 
				+        if item.is_dir():
			
 
				+            with open(item / "folder_mapping.json", "r") as mapping_file:
			
 
				+                folder_mapping = json.load(mapping_file)
			
 
				+
			
 
				+            if country_code in folder_mapping:
			
 
				+                country_folders = folder_mapping[country_code]
			
 
				+                if isinstance(country_folders, str):
			
 
				+                    # only one folder
			
 
				+                    country_folders = [country_folders]
			
 
				+
			
 
				+                for country_folder in country_folders:
			
 
				+                    input_folder = item / country_folder / submission
			
 
				+                    if input_folder.exists():
			
 
				+                        for filepath in input_folder.glob("*"):
			
 
				+                            input_files.append(filepath.relative_to(rootpath))
			
 
				+
			
 
				+    if print_info:
			
 
				+        if input_files:
			
 
				+            print(f"Found possible input files:")
			
 
				+            for file in input_files:
			
 
				+                print(file)
			
 
				+        else:
			
 
				+            print(f"No input files found")
			
 
				+
			
 
				+    return input_files
			
 
				+
			
 
				+
			
 
				+def get_possible_outputs(
			
 
				+        country_name: str,
			
 
				+        submission: str,
			
 
				+        print_info: bool = False,
			
 
				+)-> List[Path]:
			
 
				+
			
 
				+    """
			
 
				+    For given country name and submission find the possible output files
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+        country_name: str
			
 
				+            String containing the country name or ISO 3 letter code
			
 
				+
			
 
				+        submission: str
			
 
				+            String of the submission
			
 
				+
			
 
				+        print_info: bool = False
			
 
				+            If True print information on outputs found
			
 
				+
			
 
				+    Returns
			
 
				+    -------
			
 
				+        returns a list pathlib Path objects for the input files
			
 
				+    """
			
 
				+
			
 
				+    codepath = Path(__file__).parent
			
 
				+    #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
			
 
				+    rootpath = codepath / ".." / ".."
			
 
				+    rootpath = rootpath.resolve()
			
 
				+    data_folder = rootpath / "extracted_data"
			
 
				+
			
 
				+    # obtain country code
			
 
				+    country_code = countrynames.to_code_3(country_name)
			
 
				+    if country_code is None:
			
 
				+        raise ValueError(f"Country name {country_name} can not be mapped to "
			
 
				+                         f"any country code")
			
 
				+
			
 
				+    if print_info:
			
 
				+        print(f"Country name {country_name} maps to ISO code {country_code}")
			
 
				+
			
 
				+    output_files = []
			
 
				+    for item in data_folder.iterdir():
			
 
				+        if item.is_dir():
			
 
				+            with open(item / "folder_mapping.json", "r") as mapping_file:
			
 
				+                folder_mapping = json.load(mapping_file)
			
 
				+
			
 
				+            if country_code in folder_mapping:
			
 
				+                country_folder = folder_mapping[country_code]
			
 
				+                if not isinstance(country_folder, str):
			
 
				+                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
			
 
				+
			
 
				+                output_folder = item / country_folder
			
 
				+                if output_folder.exists():
			
 
				+                    for filepath in output_folder.glob(country_code + "_" + submission + "*"):
			
 
				+                        output_files.append(filepath.relative_to(rootpath))
			
 
				+
			
 
				+    if print_info:
			
 
				+        if output_files:
			
 
				+            print(f"Found possible output files:")
			
 
				+            for file in output_files:
			
 
				+                print(file)
			
 
				+        else:
			
 
				+            print(f"No output files found")
			
 
				+
			
 
				+    return output_files
			
 
				+
			
 
				+
			
 
				+def get_code_file(
			
 
				+        country_name: str,
			
 
				+        submission: str,
			
 
				+        print_info: bool = False,
			
 
				+) -> Path:
			
 
				+    """
			
 
				+    For given country name and submission find the script that creates the data
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+        country_name: str
			
 
				+            String containing the country name or ISO 3 letter code
			
 
				+
			
 
				+        submission: str
			
 
				+            String of the submission
			
 
				+
			
 
				+        print_info: bool = False
			
 
				+            If True print information on code found
			
 
				+
			
 
				+    Returns
			
 
				+    -------
			
 
				+        returns a pathlib Path object for the code file
			
 
				+    """
			
 
				+
			
 
				+    codepath = Path(__file__).parent
			
 
				+    #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
			
 
				+    rootpath = codepath / ".." / ".."
			
 
				+    rootpath = rootpath.resolve()
			
 
				+    code_file_path = None
			
 
				+
			
 
				+    # obtain country code
			
 
				+    country_code = countrynames.to_code_3(country_name)
			
 
				+    if country_code is None:
			
 
				+        raise ValueError(f"Country name {country_name} can not be mapped to "
			
 
				+                         f"any country code")
			
 
				+
			
 
				+    if print_info:
			
 
				+        print(f"Country name {country_name} maps to ISO code {country_code}")
			
 
				+
			
 
				+    with open(codepath / "folder_mapping.json", "r") as mapping_file:
			
 
				+        folder_mapping = json.load(mapping_file)
			
 
				+
			
 
				+    if country_code not in folder_mapping:
			
 
				+        if print_info:
			
 
				+            print("No code available")
			
 
				+            print("")
			
 
				+    else:
			
 
				+        country_folder = codepath / folder_mapping[country_code]
			
 
				+        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
			
 
				+
			
 
				+        for file in country_folder.iterdir():
			
 
				+            if file.match(code_file_name_candidate):
			
 
				+                if code_file_path is not None:
			
 
				+                    raise ValueError(f"Found multiple code candidates: "
			
 
				+                                     f"{code_file_path} and file.name. "
			
 
				+                                     f"Please use only one file with name "
			
 
				+                                     f"'read_ISO3_submission_XXX.YYY'.")
			
 
				+                else:
			
 
				+                    if print_info:
			
 
				+                        print(f"Found code file {file.relative_to(rootpath)}")
			
 
				+                code_file_path = file
			
 
				+
			
 
				+    return code_file_path.relative_to(rootpath)
			
 
				+
			
 
				 
			
 
				 def create_folder_mapping(
			
 
				         folder: str,
			
--- a/code/UNFCCC_reader/read_UNFCCC_submission.py
+++ b/code/UNFCCC_reader/read_UNFCCC_submission.py
@@ -2,14 +2,69 @@
 
				 # runs the appropriate script to extract the submission data
			
 
				 
			
 
				 import sys
			
 
				-if len(sys.argv) > 2:
			
 
				+import datalad.api
			
 
				+from pathlib import Path
			
 
				+from get_submissions_info import get_code_file
			
 
				+from get_submissions_info import get_possible_inputs
			
 
				+from get_submissions_info import get_possible_outputs
			
 
				+
			
 
				+
			
 
				+if len(sys.argv) > 3:
			
 
				     raise TypeError('Too many arguments given. '
			
 
				                     'Need exactly two arguments (country, submission)')
			
 
				-elif len(sys.argv) < 2:
			
 
				+elif len(sys.argv) < 3:
			
 
				     raise TypeError('Too few arguments given. '
			
 
				                     'Need exactly two arguments (country, submission)')
			
 
				 
			
 
				-country = sys.argv[0]
			
 
				-submission = sys.argv[1]
			
 
				+country = sys.argv[1]
			
 
				+submission = sys.argv[2]
			
 
				+
			
 
				+codepath = Path(__file__).parent
			
 
				+rootpath = codepath / ".." / ".."
			
 
				+rootpath = rootpath.resolve()
			
 
				+
			
 
				+print(f"Attempting to extract data for {submission} from {country}.")
			
 
				+print("#"*80)
			
 
				+print("")
			
 
				+
			
 
				+# get the correct script
			
 
				+script_name = get_code_file(country, submission)
			
 
				+if script_name:
			
 
				+    print(f"Found code file {script_name}")
			
 
				+    print("")
			
 
				+
			
 
				+    # get possible input files
			
 
				+    input_files = get_possible_inputs(country, submission)
			
 
				+    if not input_files:
			
 
				+        print(f"No possible input files found for {country}, {submission}. "
			
 
				+              f"Something might be wrong here.")
			
 
				+    else:
			
 
				+        print(f"Found the following input_files:")
			
 
				+        for file in input_files:
			
 
				+            print(file)
			
 
				+        print("")
			
 
				+
			
 
				+    # get possible output files
			
 
				+    output_files = get_possible_outputs(country, submission)
			
 
				+    if not output_files:
			
 
				+        print(f"No possible output files found for {country}, {submission}. "
			
 
				+              f"This is either the first run or something is wrong.")
			
 
				+    else:
			
 
				+        print(f"Found the following output_files:")
			
 
				+        for file in output_files:
			
 
				+            print(file)
			
 
				+        print("")
			
 
				 
			
 
				+    print(f"Run the script using datalad run via the python api")
			
 
				+    datalad.api.run(
			
 
				+        cmd=f"./venv/bin/python3.8 {script_name}",
			
 
				+        dataset=rootpath,
			
 
				+        message=f"Read data for {country}, {submission}.",
			
 
				+        inputs=input_files,
			
 
				+        outputs=output_files,
			
 
				+    )
			
 
				+else:
			
 
				+    # no code found.
			
 
				+    print(f"No code found to read {submission} from {country}")
			
 
				+    # TODO write info on available submissions and data
			
 
				 
			
--- a/code/requirements.txt
+++ b/code/requirements.txt
@@ -4,4 +4,4 @@ pandas
 
				 selenium
			
 
				 primap2
			
 
				 countrynames
			
 
				-json
			
 
				+datalad
			
--- a/extracted_data/UNFCCC/folder_mapping.json
+++ b/extracted_data/UNFCCC/folder_mapping.json
@@ -0,0 +1,4 @@
 
				+{
			
 
				+    "KOR": "Republic_of_Korea",
			
 
				+    "CHL": "Chile"
			
 
				+}