2 yıl önce · eeee5577ba
--- a/code/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py
+++ b/code/UNFCCC_CRF_reader/UNFCCC_CRF_reader_prod.py
@@ -11,7 +11,7 @@ import pycountry
 
				 import datalad.api
			
 
				 from datetime import date
			
 
				 from pathlib import Path
			
 
				-from typing import Optional, List
			
 
				+from typing import Optional, List, Dict, Union
			
 
				 
			
 
				 #from . import crf_specifications as crf
			
 
				 import crf_specifications as crf
			
@@ -119,27 +119,13 @@ def read_crf_for_country(
 
				         submission_date = get_latest_date_for_country(country_code, submission_year)
			
 
				 
			
 
				     # check if data has been read already
			
 
				-    output_folder = extracted_data_path / country_name.replace(" ", "_")
			
 
				-    output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
			
 
				-    read_data = True
			
 
				-    print(f"re_read: {re_read}")
			
 
				-    if not re_read:
			
 
				-        print("test")
			
 
				-        if output_folder.exists():
			
 
				-            existing_files = output_folder.glob(f"{output_filename}.*")
			
 
				-            existing_suffixes= [file.suffix for file in existing_files]
			
 
				-            if all(suffix in existing_suffixes for suffix in [".nc", ".yaml", ".csv"]):
			
 
				-                print(f"Data already available for {country_code}, "
			
 
				-                      f"CRF{submission_year}, version {submission_date}. "
			
 
				-                      "Skipping.")
			
 
				-            else:
			
 
				-                print(f"Partial data available for {country_code}, "
			
 
				-                      f"CRF{submission_year}, version {submission_date}. "
			
 
				-                      "Reading for completion. Please check if all files "
			
 
				-                      "have been written.")
			
 
				+    read_data = not submission_has_been_read(
			
 
				+        country_code, country_name, submission_year=submission_year,
			
 
				+        submission_date=submission_date, verbose=True,
			
 
				+    )
			
 
				 
			
 
				     ds_all = None
			
 
				-    if read_data:
			
 
				+    if read_data or re_read:
			
 
				         unknown_categories = []
			
 
				         last_row_info = []
			
 
				         for table in tables:
			
@@ -245,67 +231,15 @@ def read_crf_for_country_datalad(
 
				 
			
 
				     """
			
 
				 
			
 
				-    # get the country code and name
			
 
				-    # both could be given as input, so we need this two step process
			
 
				-    if country in custom_country_mapping:
			
 
				-        country_code = country
			
 
				-    else:
			
 
				-        country_code = get_country_code(country)
			
 
				-    # now get the country name
			
 
				-    country_name = get_country_name(country_code)
			
 
				+    # get all the info for the country
			
 
				+    country_info = get_input_and_output_files_for_country(
			
 
				+        country, submission_year=submission_year, verbose=True)
			
 
				 
			
 
				     print(f"Attempting to read data for CRF{submission_year} from {country}.")
			
 
				     print("#"*80)
			
 
				     print("")
			
 
				-
			
 
				     print(f"Using the UNFCCC_CRF_reader")
			
 
				     print("")
			
 
				-
			
 
				-    # determine latest data
			
 
				-    if submission_date is None:
			
 
				-        print(f"No submission date given, find latest date.")
			
 
				-        submission_date = get_latest_date_for_country(country_code, submission_year)
			
 
				-    else:
			
 
				-        print(f"Using given submissions date {submission_date}")
			
 
				-
			
 
				-    if submission_date is None:
			
 
				-        # there is no data. Raise an exception
			
 
				-        raise NoCRFFilesError(f"No submissions found for {country_code}, "
			
 
				-                              f"submission_year={submission_year}, "
			
 
				-                              f"date={date}")
			
 
				-    else:
			
 
				-        print(f"Latest submission date for CRF{submission_year} is {submission_date}")
			
 
				-
			
 
				-    # get possible input files
			
 
				-    input_files = get_crf_files(country_codes=country_code,
			
 
				-                                submission_year=submission_year,
			
 
				-                                date=submission_date)
			
 
				-    if not input_files:
			
 
				-        raise NoCRFFilesError(f"No possible input files found for {country}, CRF{submission_year}, "
			
 
				-                              f"v{submission_date}. Are they already submitted and included in the "
			
 
				-                              f"repository?")
			
 
				-    else:
			
 
				-        print(f"Found the following input_files:")
			
 
				-        for file in input_files:
			
 
				-            print(file.name)
			
 
				-        print("")
			
 
				-
			
 
				-    # convert file's path to str
			
 
				-    input_files = [file.as_posix() for file in input_files]
			
 
				-
			
 
				-    # get output file
			
 
				-    output_folder = extracted_data_path / country_name.replace(" ", "_")
			
 
				-    output_files = [output_folder / f"{country_code}_CRF{submission_year}"
			
 
				-                    f"_{submission_date}.{suffix}" for suffix
			
 
				-                    in ['yaml', 'csv', 'nc']]
			
 
				-    print(f"The following files are considered as output_files:")
			
 
				-    for file in output_files:
			
 
				-        print(file)
			
 
				-    print("")
			
 
				-
			
 
				-    # convert file paths to str
			
 
				-    output_files = [file.as_posix() for file in output_files]
			
 
				-
			
 
				     print(f"Run the script using datalad run via the python api")
			
 
				     script = code_path / "UNFCCC_CRF_reader" / "read_UNFCCC_CRF_submission.py"
			
 
				 
			
@@ -314,8 +248,8 @@ def read_crf_for_country_datalad(
 
				             f"--submission_year={submission_year} --submission_date={submission_date}",
			
 
				         dataset=root_path,
			
 
				         message=f"Read data for {country}, CRF{submission_year}, {submission_date}.",
			
 
				-        inputs=input_files,
			
 
				-        outputs=output_files,
			
 
				+        inputs=country_info["input"],
			
 
				+        outputs=country_info["output"],
			
 
				         dry_run=None,
			
 
				         explicit=True,
			
 
				     )
			
@@ -418,72 +352,56 @@ def read_new_crf_for_year_datalad(
 
				 
			
 
				     """
			
 
				 
			
 
				+    if countries is not None:
			
 
				+        print(f"Reading CRF{submission_year} for countries {countries} using UNFCCC_CRF_reader.")
			
 
				+    else:
			
 
				+        print(f"Reading CRF{submission_year} for all countries using UNFCCC_CRF_reader.")
			
 
				+        countries = all_crf_countries
			
 
				+    print("#" * 80)
			
 
				+    print("")
			
 
				+    if re_read:
			
 
				+        print("Re-reading all latest submissions.")
			
 
				+    else:
			
 
				+        print("Only reading new submissions not read yet.")
			
 
				+
			
 
				+
			
 
				     input_files = []
			
 
				     output_files = []
			
 
				     # loop over countries to collect input and output files
			
 
				+    print("Collect input and output files to pass to datalad")
			
 
				     for country in countries:
			
 
				-        # get the country code and name
			
 
				-        # both could be given as input, so we need this two step process
			
 
				-        if country in custom_country_mapping:
			
 
				-            country_code = country
			
 
				-        else:
			
 
				-            country_code = get_country_code(country)
			
 
				-        # now get the country name
			
 
				-        country_name = get_country_name(country_code)
			
 
				-
			
 
				-        print(f"Attempting to read data for CRF{submission_year} from {country}.")
			
 
				-        print("#"*80)
			
 
				-        print("")
			
 
				-
			
 
				-        print(f"Using the UNFCCC_CRF_reader")
			
 
				-        print("")
			
 
				-
			
 
				-        # get possible input files
			
 
				-        new_input_files = get_crf_files(country_codes=country_code,
			
 
				-                                    submission_year=submission_year,
			
 
				-                                    date=submission_date)
			
 
				-        new_input
			
 
				-        if not input_files:
			
 
				-            if submission_date is not None:
			
 
				-                print(f"No possible input files found for {country}, CRF{submission_year}, "
			
 
				-                      f"v{submission_date}. Are they already submitted and included in the "
			
 
				-                      f"repository?")
			
 
				+        try:
			
 
				+            country_info = get_input_and_output_files_for_country(
			
 
				+                country, submission_year=submission_year, verbose=False)
			
 
				+            # check if the submission has been read already
			
 
				+            if re_read:
			
 
				+                input_files = input_files + country_info["input"]
			
 
				+                output_files = output_files + country_info["output"]
			
 
				             else:
			
 
				-                print(f"No possible input files found for {country}, CRF{submission_year}. "
			
 
				-                      f"Are they already submitted and included in the repository?")
			
 
				-        else:
			
 
				-            print(f"Found the following input_files:")
			
 
				-            for file in input_files:
			
 
				-                print(file.name)
			
 
				-            print("")
			
 
				-
			
 
				-    # convert file's path to str
			
 
				-    input_files = [file.as_posix() for file in input_files]
			
 
				-
			
 
				-    # get output file
			
 
				-    if submission_date is None:
			
 
				-        submission_date = get_latest_date_for_country(country_code, submission_year)
			
 
				-
			
 
				-    output_folder = extracted_data_path / country_name.replace(" ", "_")
			
 
				-    output_files = [output_folder / f"{country_code}_CRF{submission_year}"
			
 
				-                    f"_{submission_date}.{suffix}" for suffix
			
 
				-                    in ['yaml', 'csv', 'nc']]
			
 
				-    print(f"The following files are considered as output_files:")
			
 
				-    for file in output_files:
			
 
				-        print(file)
			
 
				-    print("")
			
 
				-
			
 
				-    # convert file paths to str
			
 
				-    output_files = [file.as_posix() for file in output_files]
			
 
				+                data_read = submission_has_been_read(
			
 
				+                    country_info["code"], country_info["name"],
			
 
				+                    submission_year=submission_year,
			
 
				+                    submission_date=country_info["date"],
			
 
				+                    verbose=False,
			
 
				+                )
			
 
				+                if not data_read:
			
 
				+                    input_files = input_files + country_info["input"]
			
 
				+                    output_files = output_files + country_info["output"]
			
 
				+        except:
			
 
				+            # no error handling here as that is done in the function that does the actual reading
			
 
				+            pass
			
 
				 
			
 
				     print(f"Run the script using datalad run via the python api")
			
 
				-    script = code_path / "UNFCCC_CRF_reader" / "read_UNFCCC_CRF_submission.py"
			
 
				+    script = code_path / "UNFCCC_CRF_reader" / "read_new_UNFCCC_CRF_for year.py"
			
 
				 
			
 
				+    cmd = f"./venv/bin/python3 {script.as_posix()} --countries={countries} "\
			
 
				+          f"--submission_year={submission_year}"
			
 
				+    if re_read:
			
 
				+        cmd = cmd + " --re_read"
			
 
				     datalad.api.run(
			
 
				-        cmd=f"./venv/bin/python3 {script.as_posix()} --country={country} "
			
 
				-            f"--submission_year={submission_year} --submission_date={submission_date}",
			
 
				+        cmd=cmd,
			
 
				         dataset=root_path,
			
 
				-        message=f"Read data for {country}, CRF{submission_year}, {submission_date}.",
			
 
				+        message=f"Read data for {countries}, CRF{submission_year}. Re-reading: {re_read}",
			
 
				         inputs=input_files,
			
 
				         outputs=output_files,
			
 
				         dry_run=None,
			
@@ -496,6 +414,85 @@ def read_new_crf_for_year_datalad(
 
				 # give option to only read new data (no output yet), but also option to
			
 
				 # read all data, e.g. when specifications have changed
			
 
				 
			
 
				+def get_input_and_output_files_for_country(
			
 
				+        country: str,
			
 
				+        submission_year: int,
			
 
				+        submission_date: Optional[str]=None,
			
 
				+        verbose: Optional[bool]=True,
			
 
				+) -> Dict[str, Union[List, str]]:
			
 
				+    """
			
 
				+    Get input and output files for a given country
			
 
				+    """
			
 
				+
			
 
				+    country_info = {}
			
 
				+
			
 
				+    if country in custom_country_mapping:
			
 
				+        country_code = country
			
 
				+    else:
			
 
				+        country_code = get_country_code(country)
			
 
				+    # now get the country name
			
 
				+    country_name = get_country_name(country_code)
			
 
				+    country_info["code"] = country_code
			
 
				+    country_info["name"] = country_name
			
 
				+
			
 
				+    # determine latest data
			
 
				+    print(f"Determining input and output files for {country}")
			
 
				+    if submission_date is None:
			
 
				+        if verbose:
			
 
				+            print(f"No submission date given, find latest date.")
			
 
				+        submission_date = get_latest_date_for_country(country_code, submission_year)
			
 
				+    else:
			
 
				+        if verbose:
			
 
				+            print(f"Using given submissions date {submission_date}")
			
 
				+
			
 
				+    if submission_date is None:
			
 
				+        # there is no data. Raise an exception
			
 
				+        raise NoCRFFilesError(f"No submissions found for {country_code}, "
			
 
				+                              f"submission_year={submission_year}, "
			
 
				+                              f"date={date}")
			
 
				+    else:
			
 
				+        if verbose:
			
 
				+            print(f"Latest submission date for CRF{submission_year} is {submission_date}")
			
 
				+    country_info["date"] = submission_date
			
 
				+
			
 
				+    # get possible input files
			
 
				+    input_files = get_crf_files(country_codes=country_code,
			
 
				+                                submission_year=submission_year,
			
 
				+                                date=submission_date)
			
 
				+    if not input_files:
			
 
				+        raise NoCRFFilesError(f"No possible input files found for {country}, CRF{submission_year}, "
			
 
				+                              f"v{submission_date}. Are they already submitted and included in the "
			
 
				+                              f"repository?")
			
 
				+    elif verbose:
			
 
				+        print(f"Found the following input_files:")
			
 
				+        for file in input_files:
			
 
				+            print(file.name)
			
 
				+        print("")
			
 
				+
			
 
				+
			
 
				+    # convert file's path to str
			
 
				+    input_files = [file.as_posix() for file in input_files]
			
 
				+    country_info["input"] = input_files
			
 
				+
			
 
				+    # get output file
			
 
				+    output_folder = extracted_data_path / country_name.replace(" ", "_")
			
 
				+    output_files = [output_folder / f"{country_code}_CRF{submission_year}"
			
 
				+                                    f"_{submission_date}.{suffix}" for suffix
			
 
				+                    in ['yaml', 'csv', 'nc']]
			
 
				+    if verbose:
			
 
				+        print(f"The following files are considered as output_files:")
			
 
				+        for file in output_files:
			
 
				+            print(file)
			
 
				+        print("")
			
 
				+
			
 
				+    # check if output data present
			
 
				+
			
 
				+    # convert file paths to str
			
 
				+    output_files = [file.as_posix() for file in output_files]
			
 
				+    country_info["output"] = output_files
			
 
				+
			
 
				+    return country_info
			
 
				+
			
 
				 
			
 
				 def get_country_name(
			
 
				         country_code: str,
			
@@ -511,4 +508,40 @@ def get_country_name(
 
				             raise ValueError(f"Country code {country_code} can not be mapped to "
			
 
				                              f"any country")
			
 
				 
			
 
				-    return country_name
			
 
				+    return country_name
			
 
				+
			
 
				+
			
 
				+def submission_has_been_read(
			
 
				+        country_code: str,
			
 
				+        country_name: str,
			
 
				+        submission_year: int,
			
 
				+        submission_date: str,
			
 
				+        verbose: Optional[bool]=True,
			
 
				+) -> bool:
			
 
				+    """
			
 
				+    Check if a CRF submission has already been read
			
 
				+    """
			
 
				+    output_folder = extracted_data_path / country_name.replace(" ", "_")
			
 
				+    output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
			
 
				+    if output_folder.exists():
			
 
				+        existing_files = output_folder.glob(f"{output_filename}.*")
			
 
				+        existing_suffixes = [file.suffix for file in existing_files]
			
 
				+        if all(suffix in existing_suffixes for suffix in [".nc", ".yaml", ".csv"]):
			
 
				+            has_been_read = True
			
 
				+            if verbose:
			
 
				+                print(f"Data already available for {country_code}, "
			
 
				+                      f"CRF{submission_year}, version {submission_date}.")
			
 
				+        else:
			
 
				+            has_been_read = False
			
 
				+            if verbose:
			
 
				+                print(f"Partial data available for {country_code}, "
			
 
				+                      f"CRF{submission_year}, version {submission_date}. "
			
 
				+                      "Please check if all files have been written after "
			
 
				+                      "reading.")
			
 
				+    else:
			
 
				+        has_been_read = False
			
 
				+        if verbose:
			
 
				+            print(f"No read data available for {country_code}, "
			
 
				+                  f"CRF{submission_year}, version {submission_date}. ")
			
 
				+
			
 
				+    return has_been_read
			
--- a/code/UNFCCC_CRF_reader/util.py
+++ b/code/UNFCCC_CRF_reader/util.py
@@ -28,4 +28,7 @@ all_crf_countries = [
 
				     'MLT', 'NLD', 'NOR', 'NZL', 'POL', # 40
			
 
				     'PRT', 'ROU', 'RUS', 'SVK', 'SVN',
			
 
				     'SWE', 'TUR', 'UKR', 'USA', # 49
			
 
				-]
			
 
				+]
			
 
				+
			
 
				+class NoCRFFilesError(Exception):
			
 
				+    pass
			
--- a/dodo.py
+++ b/dodo.py
@@ -145,7 +145,10 @@ def task_read_unfccc_submission():
 
				     """ Read submission for a country (if code exists) (not for CRF)"""
			
 
				     return {
			
 
				         'actions': [f"./venv/bin/python code/UNFCCC_reader/read_UNFCCC_submission.py "
			
 
				-                    f"--country={read_config['country']} --submission={read_config['submission']}"],
			
 
				+                    f"--country={read_config['country']} --submission={read_config['submission']}",
			
 
				+                    f"./venv/bin/python code/UNFCCC_reader/folder_mapping.py "
			
 
				+                    f"--folder=extracted_data/UNFCCC"
			
 
				+                    ],
			
 
				         'verbosity': 2,
			
 
				         'setup': ['setup_venv'],
			
 
				     }
			
--- a/extracted_data/UNFCCC/folder_mapping.json
+++ b/extracted_data/UNFCCC/folder_mapping.json
@@ -1,4 +1,6 @@
 
				 {
			
 
				-    "KOR": "Republic_of_Korea",
			
 
				+    "PRK": "Republic_of_Korea",
			
 
				+    "USA": "United_States",
			
 
				+    "DEU": "Germany",
			
 
				     "CHL": "Chile"
			
 
				 }