
More work on CRF reading including doit integration

Johannes Gütschow, 3 years ago
Parent commit 4e1c101722

+ 1 - 0
.gitignore

@@ -4,3 +4,4 @@ geckodriver.log
 __pycache__
 /JG_test_code/
 .doit.db
+log

+ 161 - 66
code/UNFCCC_CRF_reader/UNFCCC_CRF_reader_core.py

@@ -6,6 +6,7 @@ well as for test-reading to check for new categories etc.
 
 import re
 import json
+import numpy as np
 import pandas as pd
 import xarray as xr
 import primap2 as pm2
@@ -15,7 +16,8 @@ from operator import itemgetter
 from collections import Counter
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime
-import crf_specifications as crf
+from . import crf_specifications as crf
+from .utils import downloaded_data_path
 
 
 ### reading functions
@@ -121,8 +123,8 @@ def convert_crf_table_to_pm2if(
 
     meta_data = {
         "references": f"https://unfccc.int/ghg-inventories-annex-i-parties/{submission_year}",
-        "rights": "XXXX",
-        "contact": "johannes.guetschow@pik-potsdam.de",
+        "rights": "",
+        "contact": "mail@johannes-guetschow.de",
         "title": f"Data submitted in {submission_year} to the UNFCCC in the common reporting format (CRF)",
         "comment": "Read fom xlsx file by Johannes Gütschow",
         "institution": "United Nations Framework Convention on Climate Change (www.unfccc.int)",
@@ -200,67 +202,11 @@ def read_crf_table(
         country_codes = [country_codes]
 
     # get file names and locations
-    # we're filtering for country and submission year here but in the repository setup
-    # we should only have files for one country and submission in the folder. But the
-    # function can also be used on a given folder and then the filter is useful.
-    input_files = []
-    if folder is None:
-        root = Path(__file__).parents[3]
-        #root = Path(os.getcwd()).parents
-        data_folder = root / "downloaded_data" / "UNFCCC"
-        submission_folder = f"CRF{submission_year}"
-
-        with open(data_folder / "folder_mapping.json", "r") as mapping_file:
-            folder_mapping = json.load(mapping_file)
-
-        # use country default folders
-        country_folders = []
-        for country_code in country_codes:
-            if country_code in folder_mapping:
-                new_country_folders = folder_mapping[country_code]
-                if isinstance(new_country_folders, str):
-                    # only one folder
-                    country_folders = country_folders + \
-                                      [data_folder / new_country_folders / submission_folder]
-                else:
-                    country_folders = country_folders + \
-                                      [data_folder / folder / submission_folder
-                                       for folder in new_country_folders]
-            else:
-                raise ValueError(f"No data folder found for country {country_code}. "
-                                 f"Check if folder mapping is up to date.")
-    else:
-        country_folders = [folder]
-
-    file_filter_template = {}
-    file_filter_template["submission_year"] = submission_year
-    file_filter_template["party"] = country_codes
-    if data_year is not None:
-        file_filter_template["data_year"] = data_year
-
-    for input_folder in country_folders:
-        input_folder = Path(input_folder)
-        if input_folder.exists():
-            # if desired find the latest date and only read that
-            # has to be done per country
-            if date == "latest":
-                for country in country_codes:
-                    file_filter = file_filter_template.copy()
-                    file_filter["party"] = country
-                    dates = get_submission_dates(folder, file_filter)
-                    file_filter["date"] = find_latest_date(dates)
-                    input_files = input_files + \
-                                  filter_filenames(input_folder.glob("*.xlsx"),
-                                                   **file_filter)
-            else:
-                file_filter = file_filter_template.copy()
-                if date is not None:
-                    file_filter["date"] = date
-                input_files = input_files + \
-                              filter_filenames(input_folder.glob("*.xlsx"),
-                                               **file_filter)
-        else:
-            raise ValueError(f"Folder {input_folder} does not exist")
+    input_files = get_crf_files(country_codes=country_codes,
+                                submission_year=submission_year,
+                                data_year=data_year,
+                                date=date,
+                                folder=folder)
 
     # get specification
     try:
@@ -535,6 +481,106 @@ def read_crf_table_from_file(
     return df_long, unknown_categories, info_last_row
 
 
+def get_crf_files(
+        country_codes: Union[str, List[str]],
+        submission_year: int,
+        data_year: Optional[Union[int, List[int]]] = None,
+        date: Optional[str] = None,
+        folder: Optional[str] = None,
+) -> List[Path]:
+    """
+    Finds all files according to given parameters
+
+    Parameters
+    __________
+
+    country_codes: str or list[str]
+        ISO 3-letter country code or list of country codes
+
+    submission_year: int
+        Year of the submission of the data
+
+    data_year: int or List of int (optional)
+        if int, a single data year will be read. If a list of ints is given, these
+        years will be read. If nothing is given, all data years will be read.
+
+    date: str (optional, default is "latest")
+        read only the submission from the given date
+
+    folder: str (optional)
+        Folder that contains the xlsx files. If not given, folders are determined from the
+        submission_year and country_codes variables
+
+    Returns
+    _______
+        List[Path]: list of Path objects for the files
+    """
+    if isinstance(country_codes, str):
+        country_codes = [country_codes]
+    input_files = []
+    # get file names and locations
+    # we're filtering for country and submission year here but in the repository setup
+    # we should only have files for one country and submission in the folder. But the
+    # function can also be used on a given folder and then the filter is useful.
+    if folder is None:
+        data_folder = downloaded_data_path
+        submission_folder = f"CRF{submission_year}"
+
+        with open(data_folder / "folder_mapping.json", "r") as mapping_file:
+            folder_mapping = json.load(mapping_file)
+
+        # use country default folders
+        country_folders = []
+        for country_code in country_codes:
+            if country_code in folder_mapping:
+                new_country_folders = folder_mapping[country_code]
+                if isinstance(new_country_folders, str):
+                    # only one folder
+                    country_folders = country_folders + \
+                                      [data_folder / new_country_folders / submission_folder]
+                else:
+                    country_folders = country_folders + \
+                                      [data_folder / folder / submission_folder
+                                       for folder in new_country_folders]
+            else:
+                raise ValueError(f"No data folder found for country {country_code}. "
+                                 f"Check if folder mapping is up to date.")
+    else:
+        country_folders = [folder]
+
+    file_filter_template = {}
+    file_filter_template["submission_year"] = submission_year
+    file_filter_template["party"] = country_codes
+    if data_year is not None:
+        file_filter_template["data_year"] = data_year
+
+    for input_folder in country_folders:
+        input_folder = Path(input_folder)
+        if input_folder.exists():
+            # if desired find the latest date and only read that
+            # has to be done per country
+            if date == "latest":
+                for country in country_codes:
+                    file_filter = file_filter_template.copy()
+                    file_filter["party"] = country
+                    dates = get_submission_dates(input_folder, file_filter)
+                    file_filter["date"] = find_latest_date(dates)
+                    input_files = input_files + \
+                                  filter_filenames(input_folder.glob("*.xlsx"),
+                                                   **file_filter)
+            else:
+                file_filter = file_filter_template.copy()
+                if date is not None:
+                    file_filter["date"] = date
+                input_files = input_files + \
+                              filter_filenames(input_folder.glob("*.xlsx"),
+                                               **file_filter)
+        else:
+            raise ValueError(f"Folder {input_folder} does not exist")
+
+    return input_files
+
+
 def get_info_from_crf_filename(
         filename: str,
 ) -> Dict[str, Union[int, str]]:
@@ -814,6 +860,51 @@ def filter_category(
     return new_mapping
 
 
+def get_latest_date_for_country(
+        country_code: str,
+        submission_year: int,
+) -> str:
+    """
+    Find the latest submission date for a country
+
+    Parameters
+    __________
+    country_code: str
+        3-letter country code
+
+    submission_year: int
+        Year of the submission to find the latest date for
+
+    Returns
+    _______
+        str: string with date
+    """
+
+    with open(downloaded_data_path / "folder_mapping.json", "r") as mapping_file:
+        folder_mapping = json.load(mapping_file)
+
+    if country_code in folder_mapping:
+        file_filter = {}
+        file_filter["party"] = country_code
+        file_filter["submission_year"] = submission_year
+        country_folders = folder_mapping[country_code]
+        if isinstance(country_folders, str):
+            # only one folder
+            submission_date = find_latest_date(get_submission_dates(
+                downloaded_data_path / country_folders / f"CRF{submission_year}", file_filter))
+        else:
+            dates = []
+            for folder in country_folders:
+                dates = dates + get_submission_dates(
+                    downloaded_data_path / folder / f"CRF{submission_year}", file_filter)
+            submission_date = find_latest_date(dates)
+    else:
+        raise ValueError(f"No data folder found for country {country_code}. "
+                         f"Check if folder mapping is up to date.")
+
+    return submission_date
+
+
 def get_submission_dates(
         folder: Path,
         file_filter: Dict[str, Union[str, int, List]],
@@ -840,6 +931,7 @@ def get_submission_dates(
                          f"the function's purpose is to return available dates.")
 
     if folder.exists():
+        print(folder)
         files = filter_filenames(folder.glob("*.xlsx"), **file_filter)
     else:
         raise ValueError(f"Folder {folder} does not exist")
@@ -903,7 +995,10 @@ def find_latest_date(
         str: latest date
     """
 
-    dates_datetime = [[date, datetime.strptime(date, "%d%m%Y")] for date in dates]
-    dates_datetime = sorted(dates_datetime, key=itemgetter(1))
+    if len(dates) > 0:
+        dates_datetime = [[date, datetime.strptime(date, "%d%m%Y")] for date in dates]
+        dates_datetime = sorted(dates_datetime, key=itemgetter(1))
+    else:
+        raise ValueError(f"Passed list of dates is empty")
 
     return dates_datetime[-1][0]
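
For orientation, a minimal usage sketch of the two new helpers (an assumption about how they are meant to be called; requires the repository layout above and the package being importable as UNFCCC_CRF_reader):

# hypothetical usage sketch, not part of the commit
from UNFCCC_CRF_reader.UNFCCC_CRF_reader_core import (
    get_crf_files,
    get_latest_date_for_country,
)

# latest CRF2021 submission date for Germany, e.g. "12042021" (DDMMYYYY)
latest = get_latest_date_for_country("DEU", submission_year=2021)

# all matching xlsx files for that submission
files = get_crf_files(country_codes="DEU", submission_year=2021, date=latest)
for file in files:
    print(file.name)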

+ 9 - 6
code/UNFCCC_CRF_reader/UNFCCC_CRF_reader_devel.py

@@ -52,9 +52,11 @@ def save_unknown_categories_info(
                     countries_cat = f"{countries_cat}; {country} ({years_country})"
             processed_cats.append([table, cat, countries_cat])
 
-    folder = file.parents[0]
-    if not folder.exists:
-        folder.mkdir()
+
+    if not file.parents[1].exists():
+        file.parents[1].mkdir()
+    if not file.parents[0].exists():
+        file.parents[0].mkdir()
     df_processed_cats = pd.DataFrame(processed_cats, columns=["Table", "Category", "Countries"])
     df_processed_cats.to_csv(file, index=False)
 
@@ -103,8 +105,9 @@ def save_last_row_info(
                     cats_country = f"{cats_country}; {cat} ({years_category})"
             processed_last_row_info.append([table, country, cats_country])
 
-    folder = file.parents[0]
-    if not folder.exists:
-        folder.mkdir()
+    if not file.parents[1].exists():
+        file.parents[1].mkdir()
+    if not file.parents[0].exists():
+        file.parents[0].mkdir()
     df_processed_lost_row_info = pd.DataFrame(processed_last_row_info, columns=["Table", "Country", "Categories"])
     df_processed_lost_row_info.to_csv("test_last_row_info.csv", index=False)
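
The explicit parents[1]/parents[0] checks replace the old "if not folder.exists:", which never fired because the missing parentheses made the expression always truthy. A more compact standard-library alternative, sketched here only for comparison:

from pathlib import Path

def ensure_parent_dirs(file: Path) -> None:
    # create the log folder and any missing ancestors in one call
    file.parent.mkdir(parents=True, exist_ok=True)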

+ 153 - 47
code/UNFCCC_CRF_reader/UNFCCC_CRF_reader.py → code/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py

@@ -1,34 +1,42 @@
-import re
-from pathlib import Path
-from treelib import Tree
+#import re
+#
+#from treelib import Tree
+
 
-import pandas as pd
+#import pandas as pd
 import xarray as xr
 import primap2 as pm2
+import numpy as np
 import pycountry
-import crf_specifications as crf
-from typing import Dict, List, Optional, Tuple, Union
+import datalad.api
 from datetime import date
+from pathlib import Path
+from typing import Optional
+
+from . import crf_specifications as crf
 
 from .UNFCCC_CRF_reader_core import read_crf_table
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
+from .UNFCCC_CRF_reader_core import get_latest_date_for_country
+from .UNFCCC_CRF_reader_core import get_crf_files
 from .UNFCCC_CRF_reader_devel import save_unknown_categories_info
 from .UNFCCC_CRF_reader_devel import save_last_row_info
 
-from . import log_path, custom_country_mapping, extracted_data_path
+from .utils import code_path, log_path, \
+    custom_country_mapping, extracted_data_path, root_path
+
+import sys
+sys.path.append('../UNFCCC_reader')
+from UNFCCC_reader.get_submissions_info import get_country_code
+
 
 # functions:
-# * production functions
-# ** read one table for a country
-# ** read alist of tables for one country
-# ** convert to IF and NC and save
 # * testing functions
 # ** read one or more table(s) for all countries
 #    (and a if desired only a single year) and write
 #    output files with missing sectors etc
 # **
 
-# TODO: add saving to read_crf_for_country
 # TODO: add function to read several / all countries
 
 
@@ -44,6 +52,7 @@ from . import log_path, custom_country_mapping, extracted_data_path
 def read_crf_for_country(
         country_code: str,
         submission_year: int,
+        submission_date: Optional[str]=None,
 ) -> xr.Dataset:
     """
     Read CRF data for given submission year and country. All tables
@@ -77,23 +86,17 @@ def read_crf_for_country(
     submission_year: int
         Year of the submission of the data
 
+    submission_date: Optional(str)
+        Read for a specific submission date (given as string as in the file names)
+        If not specified latest data will be read
+
     Returns
     _______
-        first return value is a Pandas DataFrame with the read data in long format
-        second return value
-        third return value TODO
-
+        return value is an xarray Dataset with the read data in native PRIMAP2 format
     """
+
     # get country name
-    if country_code in custom_country_mapping:
-        country_name = custom_country_mapping(country_code)
-    else:
-        try:
-            country = pycountry.countries.get(alpha_3=country_code)
-            country_name = country.name
-        except:
-            raise ValueError(f"Country code {country_code} can not be mapped to "
-                             f"any country")
+    country_name = get_country_name(country_code)
 
     # get specification and available tables
     try:
@@ -107,8 +110,8 @@ def read_crf_for_country(
     print(f"The following tables are available in the " \
           f"CRF{submission_year} specification: {tables}")
 
-    # TODO: get available dates (first get folders for country, then dates, select latest date and passt on)
-    # dates need to be determined here.
+    if submission_date is None:
+        submission_date = get_latest_date_for_country(country_code, submission_year)
 
     ds_all = None
     unknown_categories = []
@@ -116,7 +119,7 @@ def read_crf_for_country(
     for table in tables:
         # read table for all years
         ds_table, new_unknown_categories, new_last_row_info = read_crf_table(
-            country_code, table, submission_year, folder="CRF2021")#, data_year=[1990])
+            country_code, table, submission_year, date=submission_date)#, data_year=[1990])
 
         # collect messages on unknown rows etc
         unknown_categories = unknown_categories + new_unknown_categories
@@ -135,7 +138,9 @@ def read_crf_for_country(
         ds_table_if = convert_crf_table_to_pm2if(
             ds_table,
             2021,
-            meta_data_input={"title": "DEU"},
+            meta_data_input={"title": f"Data submitted in {submission_year} to the UNFCCC "
+                                      f"in the common reporting format (CRF) by {country_name}. "
+                                      f"Submission date: {submission_date}"},
             entity_mapping=entity_mapping,
         )
 
@@ -146,14 +151,7 @@ def read_crf_for_country(
         if ds_all is None:
             ds_all = ds_table_pm2
         else:
-            ds_all = xr.combine_by_coords(data_objects=[ds_all, ds_table_pm2],
-                                          compat='override',
-                                          data_vars='all',
-                                          coords='all',
-                                          fill_value=np.nan,
-                                          #join='outer',
-                                          combine_attrs='drop_conflicts'
-                                          )
+            ds_all = ds_all.combine_first(ds_table_pm2)
 
     # check if there were log messages.
     save_data = True
@@ -161,7 +159,7 @@ def read_crf_for_country(
         save_data = False
         today = date.today()
         log_location = log_path / f"CRF{submission_year}" \
-                       / f"{country_code}_unknown_categories_{today.strftime('%d/%m/%Y')}.csv"
+                       / f"{country_code}_unknown_categories_{today.strftime('%Y-%m-%d')}.csv"
         print(f"Unknown rows found for {country_code}. Not saving data. Saving log to "
               f"{log_location}" )
         save_unknown_categories_info(unknown_categories, log_location)
@@ -170,29 +168,137 @@ def read_crf_for_country(
         save_data = False
         today = date.today()
         log_location = log_path / f"CRF{submission_year}" \
-                       / f"{country_code}_last_row_info_{today.strftime('%d/%m/%Y')}.csv"
+                       / f"{country_code}_last_row_info_{today.strftime('%Y-%m-%d')}.csv"
         print(f"Data found in the last row for {country_code}. Not saving data. Saving log to "
               f"{log_location}")
         save_last_row_info(last_row_info, log_location)
 
     if save_data:
+        compression = dict(zlib=True, complevel=9)
         output_folder = extracted_data_path / country_name.replace(" ", "_")
-        output_filename = f"{country_code}_CRF{submission_year}_
-
-# TODO: need to consider the date when reading, there might be multiple submissions...
+        output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
 
         if not output_folder.exists():
             output_folder.mkdir()
+            # folder mapping has to be updated !!!
+            # if we do it here we will do it a lot of times when reading several countries at once
 
-        # write data in interchnange formart
-        pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+        # write data in interchange format
+        pm2.pm2io.write_interchange_format(output_folder / output_filename,
+                                           ds_all.pr.to_interchange_format())
 
         # write data in native PRIMAP2 format
-        data_pm2 = pm2.pm2io.from_interchange_format(data_if)
-        encoding = {var: compression for var in data_pm2.data_vars}
-        data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding = {var: compression for var in ds_all.data_vars}
+        ds_all.pr.to_netcdf(output_folder / (output_filename + ".nc"),
                               encoding=encoding)
 
     return ds_all
 
 
+def read_crf_for_country_datalad(
+        country: str,
+        submission_year: int,
+        submission_date: Optional[str]=None,
+) -> None:
+    """
+    Wrapper around read_crf_for_country which takes care of selecting input
+    and output files and using datalad run to trigger the data reading
+
+    Parameters
+    __________
+
+    country: str
+        ISO 3-letter country code or country name
+
+    submission_year: int
+        Year of the submission of the data
+
+    submission_date: Optional(str)
+        Read for a specific submission date (given as string as in the file names)
+        If not specified latest data will be read
+
+    """
+
+    # get the country code and name
+    # both could be given as input, so we need this two step process
+    if country in custom_country_mapping:
+        country_code = country
+    else:
+        country_code = get_country_code(country)
+    # now get the country name
+    country_name = get_country_name(country_code)
+
+    print(f"Attempting to read data for CRF{submission_year} from {country}.")
+    print("#"*80)
+    print("")
+
+    print(f"Using the UNFCCC_CRF_reader")
+    print("")
+
+    # get possible input files
+    input_files = get_crf_files(country_codes=country_code,
+                                submission_year=submission_year,
+                                date=submission_date)
+    if not input_files:
+        if submission_date is not None:
+            print(f"No possible input files found for {country}, CRF{submission_year}, "
+                  f"v{submission_date}. Are they already submitted and included in the "
+                  f"repository?")
+        else:
+            print(f"No possible input files found for {country}, CRF{submission_year}. "
+                  f"Are they already submitted and included in the repository?")
+    else:
+        print(f"Found the following input_files:")
+        for file in input_files:
+            print(file.name)
+        print("")
+
+    # convert file's path to str
+    input_files = [file.as_posix() for file in input_files]
+
+    # get output file
+    if submission_date is None:
+        submission_date = get_latest_date_for_country(country_code, submission_year)
+
+    output_folder = extracted_data_path / country_name.replace(" ", "_")
+    output_files = [output_folder / f"{country_code}_CRF{submission_year}"
+                    f"_{submission_date}.{suffix}" for suffix
+                    in ['yaml', 'csv', 'nc']]
+    print(f"The following files are considered as output_files:")
+    for file in output_files:
+        print(file)
+    print("")
+
+    # convert file paths to str
+    output_files = [file.as_posix() for file in output_files]
+
+    print(f"Run the script using datalad run via the python api")
+    script = code_path / "UNFCCC_CRF_reader" / "read_UNFCCC_CRF_submission.py"
+    datalad.api.run(
+        cmd=f"./venv/bin/python3 {script.name} --country={country} "
+            f"--submission_year={submission_year} --submission_date={submission_date}",
+        dataset=root_path,
+        message=f"Read data for {country}, CRF{submission_year}, {submission_date}.",
+        inputs=input_files,
+        outputs=output_files,
+        dry_run=None,
+        explicit=True,
+    )
+
+
+
+def get_country_name(
+        country_code: str,
+) -> str:
+    """get country name from code """
+    if country_code in custom_country_mapping:
+        country_name = custom_country_mapping[country_code]
+    else:
+        try:
+            country = pycountry.countries.get(alpha_3=country_code)
+            country_name = country.name
+        except:
+            raise ValueError(f"Country code {country_code} can not be mapped to "
+                             f"any country")
+
+    return country_name
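
A hedged sketch of how the production entry points might be called once the package and the data repository are set up (names as exported from the package __init__.py below):

# hypothetical call sketch, not part of the commit
from UNFCCC_CRF_reader import read_crf_for_country, read_crf_for_country_datalad

# read the latest CRF2021 submission for Germany and write interchange format + netcdf output
ds = read_crf_for_country("DEU", submission_year=2021)

# or let datalad run track the input and output files of the same read
read_crf_for_country_datalad("DEU", submission_year=2021)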

+ 2 - 15
code/UNFCCC_CRF_reader/__init__.py

@@ -3,19 +3,6 @@ CRF reader module
 """
 
 from pathlib import Path
+#from . import crf_specifications
+from .UNFCCC_CRF_reader_prod import read_crf_for_country, read_crf_for_country_datalad
 
-from .UNFCCC_CRF_reader import read_crf_for_country
-
-root_path = Path(__file__).parents[3]
-log_path = root_path / "log"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / ""
-
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom",
-}

+ 1 - 1
code/UNFCCC_CRF_reader/crf_specifications/CRF2021_specification.py

@@ -1717,7 +1717,7 @@ CRF2021 = {
         "sector_mapping": [
             ['Total actual emissions of halocarbons (by chemical) and SF6', ['2']],
             ['B. Chemical industry', ['2.B']],
-            ['9. Flurochemical production', ['2.9']],
+            ['9. Flurochemical production', ['2.B.9']],
             ['By-product emissions', ['2.B.9.a']],
             ['Fugitive emissions', ['2.B.9.b']],
             ['10. Other', ['2.B.10']],

+ 6 - 0
code/UNFCCC_CRF_reader/crf_specifications/CRF2022_specification.py

@@ -0,0 +1,6 @@
+#import numpy as np
+#from .util import unit_info
+
+from .CRF2021_specification import CRF2021
+
+CRF2022 = CRF2021
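
Note that CRF2022 = CRF2021 binds both names to the same dict object, so any later 2022-specific adjustment would silently change CRF2021 as well. If the specifications ever diverge, a deep copy would decouple them; a possible alternative inside CRF2022_specification.py (not what the commit does):

from copy import deepcopy

from .CRF2021_specification import CRF2021

# independent copy that can be edited for 2022 without touching the 2021 spec
CRF2022 = deepcopy(CRF2021)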

+ 1 - 1
code/UNFCCC_CRF_reader/crf_specifications/__init__.py

@@ -3,4 +3,4 @@ Define the CRF specifications here for easy access
 """
 
 from .CRF2021_specification import CRF2021
-
+from .CRF2022_specification import CRF2022

+ 24 - 0
code/UNFCCC_CRF_reader/read_UNFCCC_CRF_submission.py

@@ -0,0 +1,24 @@
+"""
+This script is a wrapper around the read_crf_for_country
+function such that it can be called from datalad
+"""
+
+from . import read_crf_for_country
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--country', help='Country name or code')
+parser.add_argument('--submission_year', help='Submission round to read')
+parser.add_argument('--submission_date', help='Date of submission to read', default=None)
+
+args = parser.parse_args()
+
+country = args.country
+submission_year = args.submission_year
+submission_date = args.submission_date
+
+read_crf_for_country(
+    country,
+    submission_year=submission_year,
+    submission_date=submission_date)
+
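
One caveat: argparse returns every value as a string, so submission_year arrives as e.g. "2021" rather than an int; whether that matters depends on how it is used downstream. A possible tightening of the parser (an assumption, not part of the commit):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--country', help='Country name or code')
parser.add_argument('--submission_year', help='Submission round to read', type=int)
parser.add_argument('--submission_date', help='Date of submission to read', default=None)
args = parser.parse_args()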

+ 24 - 0
code/UNFCCC_CRF_reader/read_UNFCCC_CRF_submission_datalad.py

@@ -0,0 +1,24 @@
+"""
+wrapper around read_crf_for_country_datalad such that it can be called
+from doit in the current setup where doit runs on system python and
+not in the venv.
+"""
+
+from . import read_crf_for_country_datalad
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--country', help='Country name or code')
+parser.add_argument('--submission_year', help='Submission round to read')
+parser.add_argument('--submission_date', help='Date of submission to read', default=None)
+
+args = parser.parse_args()
+
+country = args.country
+submission_year = args.submission_year
+submission_date = args.submission_date
+
+read_crf_for_country_datalad(
+        country,
+        submission_year=submission_year,
+        submission_date=submission_date)
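
For reference, this wrapper is what the new dodo.py task below invokes; assuming doit command-line variables as configured there, a run would look roughly like: doit read_unfccc_crf_submission country=DEU submission_year=2021 submission_date=12042021 (the date value here is only an example).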

+ 30 - 0
code/UNFCCC_CRF_reader/utils.py

@@ -0,0 +1,30 @@
+from pathlib import Path
+
+# 4 for use from nbs, fix
+root_path = Path(__file__).parents[3].absolute()
+log_path = root_path / "log"
+code_path = root_path / "code"
+downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
+extracted_data_path = root_path / "extracted_data" / "UNFCCC"
+
+custom_country_mapping = {
+    "EUA": "European Union",
+    "EUC": "European Union",
+    "FRK": "France",
+    "DKE": "Denmark",
+    "DNM": "Denmark",
+    "GBK": "United Kingdom",
+}
+
+all_crf_countries = [
+    'AUS', 'AUT', 'BEL', 'BGR', 'BLR',
+    'CAN', 'CHE', 'CYP', 'CZE', 'DEU', # 10
+    'DKE', 'DNK', 'DNM', 'ESP', 'EST',
+    'EUA', 'EUC', 'FIN', 'FRA', 'FRK', # 20
+    'GBK', 'GBR', 'GRC', 'HRV', 'HUN',
+    'IRL', 'ISL', 'ITA', 'JPN', 'KAZ', # 30
+    'LIE', 'LTU', 'LUX', 'LVA', 'MCO',
+    'MLT', 'NLD', 'NOR', 'NZL', 'POL', # 40
+    'PRT', 'ROU', 'RUS', 'SVK', 'SVN',
+    'SWE', 'TUR', 'UKR', 'USA', # 49
+]
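
A quick sanity-check sketch of how the shared paths are meant to be consumed by the reader modules (assuming the package is importable; not part of the commit):

from UNFCCC_CRF_reader.utils import downloaded_data_path, extracted_data_path, log_path

for path in (downloaded_data_path, extracted_data_path, log_path):
    # report which parts of the expected repository layout are present
    print(path, "exists" if path.exists() else "missing")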

+ 60 - 30
code/UNFCCC_reader/get_submissions_info.py

@@ -35,18 +35,7 @@ def get_country_submissions(
     codepath = Path(__file__).parent
     data_folder = codepath / ".." / ".." / "downloaded_data"
 
-    # obtain country code
-    #country_code = countrynames.to_code_3(country_name)
-    try:
-        country = pycountry.countries.search_fuzzy(country_name)
-    except:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
-    if len(country) > 1:
-        raise ValueError(f"Country name {country_name} has {len(country)} "
-                         f"possible results for country codes.")
-
-    country_code = country[0].alpha_3
+    country_code = get_country_code(country_name)
 
     if print_sub:
         print(f"Country name {country_name} maps to ISO code {country_code}")
@@ -122,10 +111,7 @@ def get_country_datasets(
 
 
     # obtain country code
-    country_code = countrynames.to_code_3(country_name)
-    if country_code is None:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
+    country_code = get_country_code(country_name)
 
     if print_ds:
         print(f"Country name {country_name} maps to ISO code {country_code}")
@@ -270,6 +256,43 @@ def get_country_datasets(
     return all_data
 
 
+def get_country_code(
+        country_name: str,
+) -> str:
+    """
+    Obtain the three-letter country code. If the input already is a code it is
+    returned; otherwise a fuzzy name search is performed.
+
+    Parameters
+    __________
+    country_name: str
+        Country code or name to get the three-letter code for.
+
+    """
+    try:
+        # check if it's a 3 letter code
+        country = pycountry.countries.get(alpha_3=country_name)
+        country_code = country.alpha_3
+    except:
+        try:
+            country = pycountry.countries.search_fuzzy(country_name)
+        except:
+            raise ValueError(f"Country name {country_name} can not be mapped to "
+                             f"any country code")
+        if len(country) > 1:
+            country_code = None
+            for current_country in country:
+                if current_country.name == country_name:
+                    country_code = current_country.alpha_3
+            if country_code is None:
+                raise ValueError(f"Country name {country_name} has {len(country)} "
+                                 f"possible results for country codes.")
+        else:
+            country_code = country[0].alpha_3
+
+    return country_code
+
+
 def get_possible_inputs(
         country_name: str,
         submission: str,
@@ -302,10 +325,7 @@ def get_possible_inputs(
     data_folder = rootpath / "downloaded_data"
 
     # obtain country code
-    country_code = countrynames.to_code_3(country_name)
-    if country_code is None:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
+    country_code = get_country_code(country_name)
 
     if print_info:
         print(f"Country name {country_name} maps to ISO code {country_code}")
@@ -371,11 +391,7 @@ def get_possible_outputs(
     data_folder = rootpath / "extracted_data"
 
     # obtain country code
-    country_code = countrynames.to_code_3(country_name)
-    if country_code is None:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
-
+    country_code = get_country_code(country_name)
     if print_info:
         print(f"Country name {country_name} maps to ISO code {country_code}")
 
@@ -436,11 +452,13 @@ def get_code_file(
     rootpath = rootpath.resolve()
     code_file_path = None
 
+    # CRF is an exception as it's read using the UNFCCC_CRF_reader module
+    # so we return the path to that.
+    if submission[0:3] == "CRF":
+        return rootpath / "UNFCCC_CRF_reader"
+
     # obtain country code
-    country_code = countrynames.to_code_3(country_name)
-    if country_code is None:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
+    country_code = get_country_code(country_name)
 
     if print_info:
         print(f"Country name {country_name} maps to ISO code {country_code}")
@@ -473,6 +491,7 @@ def get_code_file(
     else:
         return None
 
+
 def create_folder_mapping(
         folder: str,
         extracted: bool = False
@@ -514,7 +533,18 @@ def create_folder_mapping(
 
     for item in folder.iterdir():
         if item.is_dir():
-            ISO3 = countrynames.to_code_3(item.name)
+            try:
+                country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
+                if len(country) > 1:
+                    ISO3 = None
+                    for current_country in country:
+                        if current_country.name == item.name.replace("_", " "):
+                            ISO3 = current_country.alpha_3
+                else:
+                    ISO3 = country[0].alpha_3
+            except:
+                ISO3 = None
+
             if ISO3 is None:
                 if item.name not in known_folders:
                     print(folder_mapping.values())
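
For illustration, how the new get_country_code helper resolves both codes and names (a sketch; assumes pycountry is installed and code/ is on the Python path):

from UNFCCC_reader.get_submissions_info import get_country_code

print(get_country_code("DEU"))      # already a three-letter code, returned as is
print(get_country_code("Germany"))  # name resolved via pycountry, yields "DEU"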

+ 1 - 0
code/UNFCCC_reader/read_UNFCCC_submission.py

@@ -32,6 +32,7 @@ print("")
 
 # get the correct script
 script_name = get_code_file(country, submission)
+
 if script_name is not None:
     print(f"Found code file {script_name}")
     print("")

+ 20 - 4
dodo.py

@@ -72,8 +72,6 @@ def task_update_nc():
     }
 
 
-
-
 def task_download_nc():
     """ Download NC submissions """
     return {
@@ -143,9 +141,8 @@ read_config = {
     "submission": get_var('submission', None),
 }
 
-
 def task_read_unfccc_submission():
-    """ Read submission for a country (if code exists) """
+    """ Read submission for a country (if code exists) (not for CRF)"""
     return {
         'actions': [f"./venv/bin/python code/UNFCCC_reader/read_UNFCCC_submission.py "
                     f"--country={read_config['country']} --submission={read_config['submission']}"],
@@ -153,6 +150,25 @@ def task_read_unfccc_submission():
         'setup': ['setup_venv'],
     }
 
+# read UNFCCC CRF submissions.
+# datalad run is called from within the read_UNFCCC_CRF_submission_datalad.py script
+read_config_crf = {
+    "country": get_var('country', None),
+    "submission_year": get_var('submission_year', None),
+    "submission_date": get_var('submission_date', None),
+}
+
+def task_read_unfccc_crf_submission():
+    """ Read CRF submission for a country """
+    return {
+        'actions': [f"./venv/bin/python code/UNFCCC_CRF_reader/read_UNFCCC_CRF_submission_datalad.py "
+                    f"--country={read_config_crf['country']} "
+                    f"--submission_year={read_config_crf['submission_year']} "
+                    f"--submission_date={read_config_crf['submission_date']}"],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
 
 def task_country_info():
     """ Print information on submissions and datasets