
More work on CRF reading including doit integration

Johannes Gütschow, 3 years ago
Parent commit 4e1c101722

+ 1 - 0
.gitignore

@@ -4,3 +4,4 @@ geckodriver.log
 __pycache__
 /JG_test_code/
 .doit.db
+log

+ 161 - 66
code/UNFCCC_CRF_reader/UNFCCC_CRF_reader_core.py

@@ -6,6 +6,7 @@ well as for test-reading to check for new categories etc.
 
 import re
 import json
+import numpy as np
 import pandas as pd
 import xarray as xr
 import primap2 as pm2
@@ -15,7 +16,8 @@ from operator import itemgetter
 from collections import Counter
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime
-import crf_specifications as crf
+from . import crf_specifications as crf
+from .utils import downloaded_data_path
 
 
 ### reading functions
@@ -121,8 +123,8 @@ def convert_crf_table_to_pm2if(
 
     meta_data = {
         "references": f"https://unfccc.int/ghg-inventories-annex-i-parties/{submission_year}",
-        "rights": "XXXX",
-        "contact": "johannes.guetschow@pik-potsdam.de",
+        "rights": "",
+        "contact": "mail@johannes-guetschow.de",
         "title": f"Data submitted in {submission_year} to the UNFCCC in the common reporting format (CRF)",
         "comment": "Read fom xlsx file by Johannes Gütschow",
         "institution": "United Nations Framework Convention on Climate Change (www.unfccc.int)",
@@ -200,67 +202,11 @@ def read_crf_table(
         country_codes = [country_codes]
 
     # get file names and locations
-    # we're filtering for country and submission year here but in the repository setup
-    # we should only have files for one country and submission in the folder. But the
-    # function can also be used on a given folder and then the filter is useful.
-    input_files = []
-    if folder is None:
-        root = Path(__file__).parents[3]
-        #root = Path(os.getcwd()).parents
-        data_folder = root / "downloaded_data" / "UNFCCC"
-        submission_folder = f"CRF{submission_year}"
-
-        with open(data_folder / "folder_mapping.json", "r") as mapping_file:
-            folder_mapping = json.load(mapping_file)
-
-        # use country default folders
-        country_folders = []
-        for country_code in country_codes:
-            if country_code in folder_mapping:
-                new_country_folders = folder_mapping[country_code]
-                if isinstance(new_country_folders, str):
-                    # only one folder
-                    country_folders = country_folders + \
-                                      [data_folder / new_country_folders / submission_folder]
-                else:
-                    country_folders = country_folders + \
-                                      [data_folder / folder / submission_folder
-                                       for folder in new_country_folders]
-            else:
-                raise ValueError(f"No data folder found for country {country_code}. "
-                                 f"Check if folder mapping is up to date.")
-    else:
-        country_folders = [folder]
-
-    file_filter_template = {}
-    file_filter_template["submission_year"] = submission_year
-    file_filter_template["party"] = country_codes
-    if data_year is not None:
-        file_filter_template["data_year"] = data_year
-
-    for input_folder in country_folders:
-        input_folder = Path(input_folder)
-        if input_folder.exists():
-            # if desired find the latest date and only read that
-            # has to be done per country
-            if date == "latest":
-                for country in country_codes:
-                    file_filter = file_filter_template.copy()
-                    file_filter["party"] = country
-                    dates = get_submission_dates(folder, file_filter)
-                    file_filter["date"] = find_latest_date(dates)
-                    input_files = input_files + \
-                                  filter_filenames(input_folder.glob("*.xlsx"),
-                                                   **file_filter)
-            else:
-                file_filter = file_filter_template.copy()
-                if date is not None:
-                    file_filter["date"] = date
-                input_files = input_files + \
-                              filter_filenames(input_folder.glob("*.xlsx"),
-                                               **file_filter)
-        else:
-            raise ValueError(f"Folder {input_folder} does not exist")
+    input_files = get_crf_files(country_codes=country_codes,
+                                submission_year=submission_year,
+                                data_year=data_year,
+                                date=date,
+                                folder=folder)
 
     # get specification
     try:
@@ -535,6 +481,106 @@ def read_crf_table_from_file(
     return df_long, unknown_categories, info_last_row
 
 
+def get_crf_files(
+        country_codes: Union[str, List[str]],
+        submission_year: int,
+        data_year: Optional[Union[int, List[int]]] = None,
+        date: Optional[str] = None,
+        folder: Optional[str] = None,
+) -> List[Path]:
+    """
+    Finds all files according to given parameters
+
+    Parameters
+    __________
+
+    country_codes: str or list[str]
+        ISO 3-letter country code or list of country codes
+
+    submission_year: int
+        Year of the submission of the data
+
+    data_year: int or List of int (optional)
+        if int, a single data year will be read. If a list of ints is given, these
+        years will be read. If nothing is given, all data years will be read.
+
+    date: str (optional, default is "latest")
+        read only the submission from the given date
+
+    folder: str (optional)
+        Folder that contains the xlsx files. If not given, folders are determined from the
+        submission_year and country_codes variables
+
+    Returns
+    _______
+        List[Path]: list of Path objects for the files
+    """
+    if isinstance(country_codes, str):
+        country_codes = [country_codes]
+    input_files = []
+    # get file names and locations
+    # we're filtering for country and submission year here but in the repository setup
+    # we should only have files for one country and submission in the folder. But the
+    # function can also be used on a given folder and then the filter is useful.
+    if folder is None:
+        data_folder = downloaded_data_path
+        submission_folder = f"CRF{submission_year}"
+
+        with open(data_folder / "folder_mapping.json", "r") as mapping_file:
+            folder_mapping = json.load(mapping_file)
+
+        # use country default folders
+        country_folders = []
+        for country_code in country_codes:
+            if country_code in folder_mapping:
+                new_country_folders = folder_mapping[country_code]
+                if isinstance(new_country_folders, str):
+                    # only one folder
+                    country_folders = country_folders + \
+                                      [data_folder / new_country_folders / submission_folder]
+                else:
+                    country_folders = country_folders + \
+                                      [data_folder / folder / submission_folder
+                                       for folder in new_country_folders]
+            else:
+                raise ValueError(f"No data folder found for country {country_code}. "
+                                 f"Check if folder mapping is up to date.")
+    else:
+        country_folders = [folder]
+
+    file_filter_template = {}
+    file_filter_template["submission_year"] = submission_year
+    file_filter_template["party"] = country_codes
+    if data_year is not None:
+        file_filter_template["data_year"] = data_year
+
+    for input_folder in country_folders:
+        input_folder = Path(input_folder)
+        if input_folder.exists():
+            # if desired find the latest date and only read that
+            # has to be done per country
+            if date == "latest":
+                for country in country_codes:
+                    file_filter = file_filter_template.copy()
+                    file_filter["party"] = country
+                    dates = get_submission_dates(input_folder, file_filter)
+                    file_filter["date"] = find_latest_date(dates)
+                    input_files = input_files + \
+                                  filter_filenames(input_folder.glob("*.xlsx"),
+                                                   **file_filter)
+            else:
+                file_filter = file_filter_template.copy()
+                if date is not None:
+                    file_filter["date"] = date
+                input_files = input_files + \
+                              filter_filenames(input_folder.glob("*.xlsx"),
+                                               **file_filter)
+        else:
+            raise ValueError(f"Folder {input_folder} does not exist")
+
+    return input_files
+
+
 def get_info_from_crf_filename(
         filename: str,
 ) -> Dict[str, Union[int, str]]:
@@ -814,6 +860,51 @@ def filter_category(
     return new_mapping
 
 
+def get_latest_date_for_country(
+        country_code: str,
+        submission_year: int,
+) -> str:
+    """
+    Find the latest submission date for a country
+
+    Parameters
+    __________
+    country_code: str
+        3-letter country code
+
+    submission_year: int
+        Year of the submission to find the latest date for
+
+    Returns
+    _______
+        str: string with date
+    """
+
+    with open(downloaded_data_path / "folder_mapping.json", "r") as mapping_file:
+        folder_mapping = json.load(mapping_file)
+
+    if country_code in folder_mapping:
+        file_filter = {}
+        file_filter["party"] = country_code
+        file_filter["submission_year"] = submission_year
+        country_folders = folder_mapping[country_code]
+        if isinstance(country_folders, str):
+            # only one folder
+            submission_date = find_latest_date(get_submission_dates(
+                downloaded_data_path / country_folders / f"CRF{submission_year}", file_filter))
+        else:
+            dates = []
+            for folder in country_folders:
+                dates = dates + get_submission_dates(
+                    downloaded_data_path / folder / f"CRF{submission_year}", file_filter)
+            submission_date = find_latest_date(dates)
+    else:
+        raise ValueError(f"No data folder found for country {country_code}. "
+                         f"Check if folder mapping is up to date.")
+
+    return submission_date
+
+
 def get_submission_dates(
         folder: Path,
         file_filter: Dict[str, Union[str, int, List]],
@@ -840,6 +931,7 @@ def get_submission_dates(
                          f"the function's purpose is to return available dates.")
 
     if folder.exists():
+        print(folder)
         files = filter_filenames(folder.glob("*.xlsx"), **file_filter)
     else:
         raise ValueError(f"Folder {folder} does not exist")
@@ -903,7 +995,10 @@ def find_latest_date(
         str: latest date
     """
 
-    dates_datetime = [[date, datetime.strptime(date, "%d%m%Y")] for date in dates]
-    dates_datetime = sorted(dates_datetime, key=itemgetter(1))
+    if len(dates) > 0:
+        dates_datetime = [[date, datetime.strptime(date, "%d%m%Y")] for date in dates]
+        dates_datetime = sorted(dates_datetime, key=itemgetter(1))
+    else:
+        raise ValueError(f"Passed list of dates is empty")
 
     return dates_datetime[-1][0]
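
For orientation, a minimal usage sketch of the two new helpers (an assumption about how they are meant to be called; requires the repository layout above and the package being importable as UNFCCC_CRF_reader):

# hypothetical usage sketch, not part of the commit
from UNFCCC_CRF_reader.UNFCCC_CRF_reader_core import (
    get_crf_files,
    get_latest_date_for_country,
)

# latest CRF2021 submission date for Germany, e.g. "12042021" (DDMMYYYY)
latest = get_latest_date_for_country("DEU", submission_year=2021)

# all matching xlsx files for that submission
files = get_crf_files(country_codes="DEU", submission_year=2021, date=latest)
for file in files:
    print(file.name)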

+ 9 - 6
code/UNFCCC_CRF_reader/UNFCCC_CRF_reader_devel.py

@@ -52,9 +52,11 @@ def save_unknown_categories_info(
                     countries_cat = f"{countries_cat}; {country} ({years_country})"
             processed_cats.append([table, cat, countries_cat])
 
-    folder = file.parents[0]
-    if not folder.exists:
-        folder.mkdir()
+
+    if not file.parents[1].exists():
+        file.parents[1].mkdir()
+    if not file.parents[0].exists():
+        file.parents[0].mkdir()
     df_processed_cats = pd.DataFrame(processed_cats, columns=["Table", "Category", "Countries"])
     df_processed_cats.to_csv(file, index=False)
 
@@ -103,8 +105,9 @@ def save_last_row_info(
                     cats_country = f"{cats_country}; {cat} ({years_category})"
             processed_last_row_info.append([table, country, cats_country])
 
-    folder = file.parents[0]
-    if not folder.exists:
-        folder.mkdir()
+    if not file.parents[1].exists():
+        file.parents[1].mkdir()
+    if not file.parents[0].exists():
+        file.parents[0].mkdir()
     df_processed_lost_row_info = pd.DataFrame(processed_last_row_info, columns=["Table", "Country", "Categories"])
     df_processed_lost_row_info.to_csv("test_last_row_info.csv", index=False)
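
The explicit parents[1]/parents[0] checks replace the old "if not folder.exists:", which never fired because the missing parentheses made the expression always truthy. A more compact standard-library alternative, sketched here only for comparison:

from pathlib import Path

def ensure_parent_dirs(file: Path) -> None:
    # create the log folder and any missing ancestors in one call
    file.parent.mkdir(parents=True, exist_ok=True)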

+ 153 - 47
code/UNFCCC_CRF_reader/UNFCCC_CRF_reader.py → code/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py

@@ -1,34 +1,42 @@
-import re
-from pathlib import Path
-from treelib import Tree
+#import re
+#
+#from treelib import Tree
+
 
-import pandas as pd
+#import pandas as pd
 import xarray as xr
 import primap2 as pm2
+import numpy as np
 import pycountry
-import crf_specifications as crf
-from typing import Dict, List, Optional, Tuple, Union
+import datalad.api
 from datetime import date
+from pathlib import Path
+from typing import Optional
+
+from . import crf_specifications as crf
 
 from .UNFCCC_CRF_reader_core import read_crf_table
 from .UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
+from .UNFCCC_CRF_reader_core import get_latest_date_for_country
+from .UNFCCC_CRF_reader_core import get_crf_files
 from .UNFCCC_CRF_reader_devel import save_unknown_categories_info
 from .UNFCCC_CRF_reader_devel import save_last_row_info
 
-from . import log_path, custom_country_mapping, extracted_data_path
+from .utils import code_path, log_path, \
+    custom_country_mapping, extracted_data_path, root_path
+
+import sys
+sys.path.append('../UNFCCC_reader')
+from UNFCCC_reader.get_submissions_info import get_country_code
+
 
 # functions:
-# * production functions
-# ** read one table for a country
-# ** read alist of tables for one country
-# ** convert to IF and NC and save
 # * testing functions
 # ** read one or more table(s) for all countries
 #    (and a if desired only a single year) and write
 #    output files with missing sectors etc
 # **
 
-# TODO: add saving to read_crf_for_country
 # TODO: add function to read several / all countries
 
 
@@ -44,6 +52,7 @@ from . import log_path, custom_country_mapping, extracted_data_path
 def read_crf_for_country(
         country_code: str,
         submission_year: int,
+        submission_date: Optional[str]=None,
 ) -> xr.Dataset:
     """
     Read CRF data for given submission year and country. All tables
@@ -77,23 +86,17 @@ def read_crf_for_country(
     submission_year: int
         Year of the submission of the data
 
+    submission_date: Optional(str)
+        Read for a specific submission date (given as string as in the file names)
+        If not specified latest data will be read
+
     Returns
     _______
-        first return value is a Pandas DataFrame with the read data in long format
-        second return value
-        third return value TODO
-
+        return value is an xarray Dataset with the read data in native PRIMAP2 format
     """
+
     # get country name
-    if country_code in custom_country_mapping:
-        country_name = custom_country_mapping(country_code)
-    else:
-        try:
-            country = pycountry.countries.get(alpha_3=country_code)
-            country_name = country.name
-        except:
-            raise ValueError(f"Country code {country_code} can not be mapped to "
-                             f"any country")
+    country_name = get_country_name(country_code)
 
     # get specification and available tables
     try:
@@ -107,8 +110,8 @@ def read_crf_for_country(
     print(f"The following tables are available in the " \
           f"CRF{submission_year} specification: {tables}")
 
-    # TODO: get available dates (first get folders for country, then dates, select latest date and passt on)
-    # dates need to be determined here.
+    if submission_date is None:
+        submission_date = get_latest_date_for_country(country_code, submission_year)
 
     ds_all = None
     unknown_categories = []
@@ -116,7 +119,7 @@ def read_crf_for_country(
     for table in tables:
         # read table for all years
         ds_table, new_unknown_categories, new_last_row_info = read_crf_table(
-            country_code, table, submission_year, folder="CRF2021")#, data_year=[1990])
+            country_code, table, submission_year, date=submission_date)#, data_year=[1990])
 
         # collect messages on unknown rows etc
         unknown_categories = unknown_categories + new_unknown_categories
@@ -135,7 +138,9 @@ def read_crf_for_country(
         ds_table_if = convert_crf_table_to_pm2if(
             ds_table,
             2021,
-            meta_data_input={"title": "DEU"},
+            meta_data_input={"title": f"Data submitted in {submission_year} to the UNFCCC "
+                                      f"in the common reporting format (CRF) by {country_name}. "
+                                      f"Submission date: {submission_date}"},
             entity_mapping=entity_mapping,
         )
 
@@ -146,14 +151,7 @@ def read_crf_for_country(
         if ds_all is None:
             ds_all = ds_table_pm2
         else:
-            ds_all = xr.combine_by_coords(data_objects=[ds_all, ds_table_pm2],
-                                          compat='override',
-                                          data_vars='all',
-                                          coords='all',
-                                          fill_value=np.nan,
-                                          #join='outer',
-                                          combine_attrs='drop_conflicts'
-                                          )
+            ds_all = ds_all.combine_first(ds_table_pm2)
 
     # check if there were log messages.
     save_data = True
@@ -161,7 +159,7 @@ def read_crf_for_country(
         save_data = False
         today = date.today()
         log_location = log_path / f"CRF{submission_year}" \
-                       / f"{country_code}_unknown_categories_{today.strftime('%d/%m/%Y')}.csv"
+                       / f"{country_code}_unknown_categories_{today.strftime('%Y-%m-%d')}.csv"
         print(f"Unknown rows found for {country_code}. Not saving data. Saving log to "
               f"{log_location}" )
         save_unknown_categories_info(unknown_categories, log_location)
@@ -170,29 +168,137 @@ def read_crf_for_country(
         save_data = False
         today = date.today()
         log_location = log_path / f"CRF{submission_year}" \
-                       / f"{country_code}_last_row_info_{today.strftime('%d/%m/%Y')}.csv"
+                       / f"{country_code}_last_row_info_{today.strftime('%Y-%m-%d')}.csv"
         print(f"Data found in the last row for {country_code}. Not saving data. Saving log to "
               f"{log_location}")
         save_last_row_info(last_row_info, log_location)
 
     if save_data:
+        compression = dict(zlib=True, complevel=9)
         output_folder = extracted_data_path / country_name.replace(" ", "_")
-        output_filename = f"{country_code}_CRF{submission_year}_
-
-# TODO: need to consider the date when reading, there might be multiple submissions...
+        output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
 
         if not output_folder.exists():
             output_folder.mkdir()
+            # folder mapping has to be updated !!!
+            # if we do it here we will do it a lot of times when reading several countries at once
 
-        # write data in interchnange formart
-        pm2.pm2io.write_interchange_format(output_folder / (output_filename + coords_terminologies["category"]), data_if)
+        # write data in interchange format
+        pm2.pm2io.write_interchange_format(output_folder / output_filename,
+                                           ds_all.pr.to_interchange_format())
 
         # write data in native PRIMAP2 format
-        data_pm2 = pm2.pm2io.from_interchange_format(data_if)
-        encoding = {var: compression for var in data_pm2.data_vars}
-        data_pm2.pr.to_netcdf(output_folder / (output_filename + coords_terminologies["category"] + ".nc"),
+        encoding = {var: compression for var in ds_all.data_vars}
+        ds_all.pr.to_netcdf(output_folder / (output_filename + ".nc"),
                               encoding=encoding)
 
     return ds_all
 
 
+def read_crf_for_country_datalad(
+        country: str,
+        submission_year: int,
+        submission_date: Optional[str]=None,
+) -> None:
+    """
+    Wrapper around read_crf_for_country which takes care of selecting input
+    and output files and using datalad run to trigger the data reading
+
+    Parameters
+    __________
+
+    country: str
+        ISO 3-letter country code or country name
+
+    submission_year: int
+        Year of the submission of the data
+
+    submission_date: Optional(str)
+        Read for a specific submission date (given as string as in the file names)
+        If not specified latest data will be read
+
+    """
+
+    # get the country code and name
+    # both could be given as input, so we need this two step process
+    if country in custom_country_mapping:
+        country_code = country
+    else:
+        country_code = get_country_code(country)
+    # now get the country name
+    country_name = get_country_name(country_code)
+
+    print(f"Attempting to read data for CRF{submission_year} from {country}.")
+    print("#"*80)
+    print("")
+
+    print(f"Using the UNFCCC_CRF_reader")
+    print("")
+
+    # get possible input files
+    input_files = get_crf_files(country_codes=country_code,
+                                submission_year=submission_year,
+                                date=submission_date)
+    if not input_files:
+        if submission_date is not None:
+            print(f"No possible input files found for {country}, CRF{submission_year}, "
+                  f"v{submission_date}. Are they already submitted and included in the "
+                  f"repository?")
+        else:
+            print(f"No possible input files found for {country}, CRF{submission_year}. "
+                  f"Are they already submitted and included in the repository?")
+    else:
+        print(f"Found the following input_files:")
+        for file in input_files:
+            print(file.name)
+        print("")
+
+    # convert file's path to str
+    input_files = [file.as_posix() for file in input_files]
+
+    # get output file
+    if submission_date is None:
+        submission_date = get_latest_date_for_country(country_code, submission_year)
+
+    output_folder = extracted_data_path / country_name.replace(" ", "_")
+    output_files = [output_folder / f"{country_code}_CRF{submission_year}"
+                    f"_{submission_date}.{suffix}" for suffix
+                    in ['yaml', 'csv', 'nc']]
+    print(f"The following files are considered as output_files:")
+    for file in output_files:
+        print(file)
+    print("")
+
+    # convert file paths to str
+    output_files = [file.as_posix() for file in output_files]
+
+    print(f"Run the script using datalad run via the python api")
+    script = code_path / "UNFCCC_CRF_reader" / "read_UNFCCC_CRF_submission.py"
+    datalad.api.run(
+        cmd=f"./venv/bin/python3 {script.name} --country={country} "
+            f"--submission_year={submission_year} --submission_date={submission_date}",
+        dataset=root_path,
+        message=f"Read data for {country}, CRF{submission_year}, {submission_date}.",
+        inputs=input_files,
+        outputs=output_files,
+        dry_run=None,
+        explicit=True,
+    )
+
+
+
+def get_country_name(
+        country_code: str,
+) -> str:
+    """get country name from code """
+    if country_code in custom_country_mapping:
+        country_name = custom_country_mapping[country_code]
+    else:
+        try:
+            country = pycountry.countries.get(alpha_3=country_code)
+            country_name = country.name
+        except:
+            raise ValueError(f"Country code {country_code} can not be mapped to "
+                             f"any country")
+
+    return country_name
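
A hedged sketch of how the production entry points might be called once the package and the data repository are set up (names as exported from the package __init__.py below):

# hypothetical call sketch, not part of the commit
from UNFCCC_CRF_reader import read_crf_for_country, read_crf_for_country_datalad

# read the latest CRF2021 submission for Germany and write interchange format + netcdf output
ds = read_crf_for_country("DEU", submission_year=2021)

# or let datalad run track the input and output files of the same read
read_crf_for_country_datalad("DEU", submission_year=2021)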

+ 2 - 15
code/UNFCCC_CRF_reader/__init__.py

@@ -3,19 +3,6 @@ CRF reader module
 """
 
 from pathlib import Path
+#from . import crf_specifications
+from .UNFCCC_CRF_reader_prod import read_crf_for_country, read_crf_for_country_datalad
 
-from .UNFCCC_CRF_reader import read_crf_for_country
-
-root_path = Path(__file__).parents[3]
-log_path = root_path / "log"
-downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
-extracted_data_path = root_path / ""
-
-custom_country_mapping = {
-    "EUA": "European Union",
-    "EUC": "European Union",
-    "FRK": "France",
-    "DKE": "Denmark",
-    "DNM": "Denmark",
-    "GBK": "United Kingdom",
-}

+ 1 - 1
code/UNFCCC_CRF_reader/crf_specifications/CRF2021_specification.py

@@ -1717,7 +1717,7 @@ CRF2021 = {
         "sector_mapping": [
             ['Total actual emissions of halocarbons (by chemical) and SF6', ['2']],
             ['B. Chemical industry', ['2.B']],
-            ['9. Flurochemical production', ['2.9']],
+            ['9. Flurochemical production', ['2.B.9']],
             ['By-product emissions', ['2.B.9.a']],
             ['Fugitive emissions', ['2.B.9.b']],
             ['10. Other', ['2.B.10']],

+ 6 - 0
code/UNFCCC_CRF_reader/crf_specifications/CRF2022_specification.py

@@ -0,0 +1,6 @@
+#import numpy as np
+#from .util import unit_info
+
+from .CRF2021_specification import CRF2021
+
+CRF2022 = CRF2021
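
Note that CRF2022 = CRF2021 binds both names to the same dict object, so any later 2022-specific adjustment would silently change CRF2021 as well. If the specifications ever diverge, a deep copy would decouple them; a possible alternative inside CRF2022_specification.py (not what the commit does):

from copy import deepcopy

from .CRF2021_specification import CRF2021

# independent copy that can be edited for 2022 without touching the 2021 spec
CRF2022 = deepcopy(CRF2021)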

+ 1 - 1
code/UNFCCC_CRF_reader/crf_specifications/__init__.py

@@ -3,4 +3,4 @@ Define the CRF specifications here for easy access
 """
 
 from .CRF2021_specification import CRF2021
-
+from .CRF2022_specification import CRF2022

+ 24 - 0
code/UNFCCC_CRF_reader/read_UNFCCC_CRF_submission.py

@@ -0,0 +1,24 @@
+"""
+This script is a wrapper around the read_crf_for_country
+function such that it can be called from datalad
+"""
+
+from . import read_crf_for_country
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--country', help='Country name or code')
+parser.add_argument('--submission_year', help='Submission round to read')
+parser.add_argument('--submission_date', help='Date of submission to read', default=None)
+
+args = parser.parse_args()
+
+country = args.country
+submission_year = args.submission_year
+submission_date = args.submission_date
+
+read_crf_for_country(
+    country,
+    submission_year=submission_year,
+    submission_date=submission_date)
+
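
One caveat: argparse returns every value as a string, so submission_year arrives as e.g. "2021" rather than an int; whether that matters depends on how it is used downstream. A possible tightening of the parser (an assumption, not part of the commit):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--country', help='Country name or code')
parser.add_argument('--submission_year', help='Submission round to read', type=int)
parser.add_argument('--submission_date', help='Date of submission to read', default=None)
args = parser.parse_args()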

+ 24 - 0
code/UNFCCC_CRF_reader/read_UNFCCC_CRF_submission_datalad.py

@@ -0,0 +1,24 @@
+"""
+wrapper around read_crf_for_country_datalad such that it can be called
+from doit in the current setup where doit runs on system python and
+not in the venv.
+"""
+
+from . import read_crf_for_country_datalad
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--country', help='Country name or code')
+parser.add_argument('--submission_year', help='Submission round to read')
+parser.add_argument('--submission_date', help='Date of submission to read', default=None)
+
+args = parser.parse_args()
+
+country = args.country
+submission_year = args.submission_year
+submission_date = args.submission_date
+
+read_crf_for_country_datalad(
+        country,
+        submission_year=submission_year,
+        submission_date=submission_date)
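
For reference, this wrapper is what the new dodo.py task below invokes; assuming doit command-line variables as configured there, a run would look roughly like: doit read_unfccc_crf_submission country=DEU submission_year=2021 submission_date=12042021 (the date value here is only an example).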

+ 30 - 0
code/UNFCCC_CRF_reader/utils.py

@@ -0,0 +1,30 @@
+from pathlib import Path
+
+# 4 for use from nbs, fix
+root_path = Path(__file__).parents[3].absolute()
+log_path = root_path / "log"
+code_path = root_path / "code"
+downloaded_data_path = root_path / "downloaded_data" / "UNFCCC"
+extracted_data_path = root_path / "extracted_data" / "UNFCCC"
+
+custom_country_mapping = {
+    "EUA": "European Union",
+    "EUC": "European Union",
+    "FRK": "France",
+    "DKE": "Denmark",
+    "DNM": "Denmark",
+    "GBK": "United Kingdom",
+}
+
+all_crf_countries = [
+    'AUS', 'AUT', 'BEL', 'BGR', 'BLR',
+    'CAN', 'CHE', 'CYP', 'CZE', 'DEU', # 10
+    'DKE', 'DNK', 'DNM', 'ESP', 'EST',
+    'EUA', 'EUC', 'FIN', 'FRA', 'FRK', # 20
+    'GBK', 'GBR', 'GRC', 'HRV', 'HUN',
+    'IRL', 'ISL', 'ITA', 'JPN', 'KAZ', # 30
+    'LIE', 'LTU', 'LUX', 'LVA', 'MCO',
+    'MLT', 'NLD', 'NOR', 'NZL', 'POL', # 40
+    'PRT', 'ROU', 'RUS', 'SVK', 'SVN',
+    'SWE', 'TUR', 'UKR', 'USA', # 49
+]
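
A quick sanity-check sketch of how the shared paths are meant to be consumed by the reader modules (assuming the package is importable; not part of the commit):

from UNFCCC_CRF_reader.utils import downloaded_data_path, extracted_data_path, log_path

for path in (downloaded_data_path, extracted_data_path, log_path):
    # report which parts of the expected repository layout are present
    print(path, "exists" if path.exists() else "missing")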

+ 60 - 30
code/UNFCCC_reader/get_submissions_info.py

@@ -35,18 +35,7 @@ def get_country_submissions(
     codepath = Path(__file__).parent
     data_folder = codepath / ".." / ".." / "downloaded_data"
 
-    # obtain country code
-    #country_code = countrynames.to_code_3(country_name)
-    try:
-        country = pycountry.countries.search_fuzzy(country_name)
-    except:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
-    if len(country) > 1:
-        raise ValueError(f"Country name {country_name} has {len(country)} "
-                         f"possible results for country codes.")
-
-    country_code = country[0].alpha_3
+    country_code = get_country_code(country_name)
 
     if print_sub:
         print(f"Country name {country_name} maps to ISO code {country_code}")
@@ -122,10 +111,7 @@ def get_country_datasets(
 
 
     # obtain country code
-    country_code = countrynames.to_code_3(country_name)
-    if country_code is None:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
+    country_code = get_country_code(country_name)
 
     if print_ds:
         print(f"Country name {country_name} maps to ISO code {country_code}")
@@ -270,6 +256,43 @@ def get_country_datasets(
     return all_data
 
 
+def get_country_code(
+        country_name: str,
+) -> str:
+    """
+    Obtain the three-letter country code. If the input already is a code it is
+    returned; otherwise a fuzzy name search is performed.
+
+    Parameters
+    __________
+    country_name: str
+        Country code or name to get the three-letter code for.
+
+    """
+    try:
+        # check if it's a 3 letter code
+        country = pycountry.countries.get(alpha_3=country_name)
+        country_code = country.alpha_3
+    except:
+        try:
+            country = pycountry.countries.search_fuzzy(country_name)
+        except:
+            raise ValueError(f"Country name {country_name} can not be mapped to "
+                             f"any country code")
+        if len(country) > 1:
+            country_code = None
+            for current_country in country:
+                if current_country.name == country_name:
+                    country_code = current_country.alpha_3
+            if country_code is None:
+                raise ValueError(f"Country name {country_name} has {len(country)} "
+                                 f"possible results for country codes.")
+        else:
+            country_code = country[0].alpha_3
+
+    return country_code
+
+
 def get_possible_inputs(
         country_name: str,
         submission: str,
@@ -302,10 +325,7 @@ def get_possible_inputs(
     data_folder = rootpath / "downloaded_data"
 
     # obtain country code
-    country_code = countrynames.to_code_3(country_name)
-    if country_code is None:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
+    country_code = get_country_code(country_name)
 
     if print_info:
         print(f"Country name {country_name} maps to ISO code {country_code}")
@@ -371,11 +391,7 @@ def get_possible_outputs(
     data_folder = rootpath / "extracted_data"
 
     # obtain country code
-    country_code = countrynames.to_code_3(country_name)
-    if country_code is None:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
-
+    country_code = get_country_code(country_name)
     if print_info:
         print(f"Country name {country_name} maps to ISO code {country_code}")
 
@@ -436,11 +452,13 @@ def get_code_file(
     rootpath = rootpath.resolve()
     code_file_path = None
 
+    # CRF is an exception as it's read using the UNFCCC_CRF_reader module
+    # so we return the path to that.
+    if submission[0:3] == "CRF":
+        return rootpath / "UNFCCC_CRF_reader"
+
     # obtain country code
-    country_code = countrynames.to_code_3(country_name)
-    if country_code is None:
-        raise ValueError(f"Country name {country_name} can not be mapped to "
-                         f"any country code")
+    country_code = get_country_code(country_name)
 
     if print_info:
         print(f"Country name {country_name} maps to ISO code {country_code}")
@@ -473,6 +491,7 @@ def get_code_file(
     else:
         return None
 
+
 def create_folder_mapping(
         folder: str,
         extracted: bool = False
@@ -514,7 +533,18 @@ def create_folder_mapping(
 
     for item in folder.iterdir():
         if item.is_dir():
-            ISO3 = countrynames.to_code_3(item.name)
+            try:
+                country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
+                if len(country) > 1:
+                    ISO3 = None
+                    for current_country in country:
+                        if current_country.name == item.name.replace("_", " "):
+                            ISO3 = current_country.alpha_3
+                else:
+                    ISO3 = country[0].alpha_3
+            except:
+                ISO3 = None
+
             if ISO3 is None:
                 if item.name not in known_folders:
                     print(folder_mapping.values())
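
For illustration, how the new get_country_code helper resolves both codes and names (a sketch; assumes pycountry is installed and code/ is on the Python path):

from UNFCCC_reader.get_submissions_info import get_country_code

print(get_country_code("DEU"))      # already a three-letter code, returned as is
print(get_country_code("Germany"))  # name resolved via pycountry, yields "DEU"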

+ 1 - 0
code/UNFCCC_reader/read_UNFCCC_submission.py

@@ -32,6 +32,7 @@ print("")
 
 # get the correct script
 script_name = get_code_file(country, submission)
+
 if script_name is not None:
     print(f"Found code file {script_name}")
     print("")

+ 20 - 4
dodo.py

@@ -72,8 +72,6 @@ def task_update_nc():
     }
 
 
-
-
 def task_download_nc():
     """ Download NC submissions """
     return {
@@ -143,9 +141,8 @@ read_config = {
     "submission": get_var('submission', None),
 }
 
-
 def task_read_unfccc_submission():
-    """ Read submission for a country (if code exists) """
+    """ Read submission for a country (if code exists) (not for CRF)"""
     return {
         'actions': [f"./venv/bin/python code/UNFCCC_reader/read_UNFCCC_submission.py "
                     f"--country={read_config['country']} --submission={read_config['submission']}"],
@@ -153,6 +150,25 @@ def task_read_unfccc_submission():
         'setup': ['setup_venv'],
     }
 
+# read UNFCCC CRF submissions.
+# datalad run is called from within the read_UNFCCC_CRF_submission_datalad.py script
+read_config_crf = {
+    "country": get_var('country', None),
+    "submission_year": get_var('submission_year', None),
+    "submission_date": get_var('submission_date', None),
+}
+
+def task_read_unfccc_crf_submission():
+    """ Read CRF submission for a country """
+    return {
+        'actions': [f"./venv/bin/python code/UNFCCC_CRF_reader/read_UNFCCC_CRF_submission_datalad.py "
+                    f"--country={read_config_crf['country']} "
+                    f"--submission_year={read_config_crf['submission_year']} "
+                    f"--submission_date={read_config_crf['submission_date']}"],
+        'verbosity': 2,
+        'setup': ['setup_venv'],
+    }
+
 
 def task_country_info():
     """ Print information on submissions and datasets