123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536 |
- #import re
- #
- #from treelib import Tree
- #import pandas as pd
- import xarray as xr
- import primap2 as pm2
- #import numpy as np
- #import pycountry
- import datalad.api
- from datetime import date
- #from pathlib import Path
- from typing import Optional, List, Dict, Union
- #from . import crf_specifications as crf
- import crf_specifications as crf
- from UNFCCC_CRF_reader_core import read_crf_table
- from UNFCCC_CRF_reader_core import convert_crf_table_to_pm2if
- from UNFCCC_CRF_reader_core import get_latest_date_for_country
- from UNFCCC_CRF_reader_core import get_crf_files
- from UNFCCC_CRF_reader_core import get_country_name
- from UNFCCC_CRF_reader_devel import save_unknown_categories_info
- from UNFCCC_CRF_reader_devel import save_last_row_info
- from util import code_path, log_path, \
- custom_country_mapping, extracted_data_path, root_path, \
- all_crf_countries, NoCRFFilesError
- import sys
- sys.path.append(code_path.name)
- from UNFCCC_reader.get_submissions_info import get_country_code
# functions:
# * testing functions
# ** read one or more table(s) for all countries
#    (and, if desired, only a single year) and write
#    output files with missing sectors etc.
# **
# TODO: add function to read several / all countries
# general approach:
# main code in a function that reads one table from one file.
# return raw pandas DF for use in different functions
# wrappers around this function to read for a whole country or for test reading where we also
# write files with missing sectors etc.
# merging functions use native pm2 format
def read_crf_for_country(
        country_code: str,
        submission_year: int,
        submission_date: Optional[str] = None,
        re_read: Optional[bool] = True,
) -> Optional[xr.Dataset]:
    """
    Read CRF data for given submission year and country.

    All tables marked as "tested" in the specification for the submission
    year are read for all years and combined into a single dataset. The
    result is written to the country folder, which is determined from the
    country name belonging to `country_code`.

    The output is a primap2 dataset (xarray based).

    If you want to read data for more countries or from a different folder
    use the read_latest_crf_submissions_for_year or test_read_crf_data function.

    Currently there is no consistency check between data for the same category
    read from different tables.

    The data is only saved in the country folder if reading produced no
    messages (unknown categories, data in the last row) to make sure that
    data that goes into the repository is complete. The dataset is returned
    in any case. If messages appeared they are saved under
    `log_path / "CRF<submission_year>"` with the file names
    '<country_code>_unknown_categories_<date>.csv' and
    '<country_code>_last_row_info_<date>.csv'.

    Parameters
    __________

    country_code: str
        ISO 3-letter country code

    submission_year: int
        Year of the submission of the data

    submission_date: Optional(str)
        Read for a specific submission date (given as string as in the file names)
        If not specified latest data will be read

    re_read: Optional(bool) default: True
        Read the data also if it's already present

    Returns
    _______

    The read data as primap2 dataset (xr.Dataset), or None if the submission
    has been read before and `re_read` is False.

    Raises
    ______

    ValueError
        If no specification exists for `submission_year` or if the
        specification contains no tested tables to read.
    """
    # get country name
    country_name = get_country_name(country_code)

    # get specification and available tables
    try:
        crf_spec = getattr(crf, f"CRF{submission_year}")
    except AttributeError as ex:
        # only a missing specification is translated; anything else propagates
        raise ValueError(
            f"No terminology exists for submission year {submission_year}"
        ) from ex

    tables = [table for table, table_spec in crf_spec.items()
              if table_spec["status"] == "tested"]
    print(f"The following tables are available in the " \
          f"CRF{submission_year} specification: {tables}")

    if submission_date is None:
        submission_date = get_latest_date_for_country(country_code, submission_year)

    # check if data has been read already
    read_data = not submission_has_been_read(
        country_code, country_name, submission_year=submission_year,
        submission_date=submission_date, verbose=True,
    )

    if not (read_data or re_read):
        # data is already present and re-reading was not requested
        return None

    if not tables:
        # without tables there is nothing to combine or save; fail with a clear
        # message instead of an AttributeError on the empty result further down
        raise ValueError(
            f"No tested tables in the CRF{submission_year} specification"
        )

    ds_all = None
    unknown_categories = []
    last_row_info = []
    for table in tables:
        # read table for all years
        ds_table, new_unknown_categories, new_last_row_info = read_crf_table(
            country_code, table, submission_year, date=submission_date)

        # collect messages on unknown rows etc
        unknown_categories.extend(new_unknown_categories)
        last_row_info.extend(new_last_row_info)

        # convert to PRIMAP2 IF
        # first drop the orig_cat_name col as it can have multiple values for
        # one category
        ds_table = ds_table.drop(columns=["orig_cat_name"])

        # if we need to map entities pass this info to the conversion function
        entity_mapping = crf_spec[table].get("entity_mapping")
        ds_table_if = convert_crf_table_to_pm2if(
            ds_table,
            submission_year,
            meta_data_input={"title": f"Data submitted in {submission_year} to the UNFCCC "
                                      f"in the common reporting format (CRF) by {country_name}. "
                                      f"Submission date: {submission_date}"},
            entity_mapping=entity_mapping,
        )

        # now convert to native PRIMAP2 format
        ds_table_pm2 = pm2.pm2io.from_interchange_format(ds_table_if)

        # combine per table DS
        if ds_all is None:
            ds_all = ds_table_pm2
        else:
            ds_all = ds_all.combine_first(ds_table_pm2)

    # check if there were log messages; any message prevents saving the data
    save_data = True
    log_folder = log_path / f"CRF{submission_year}"
    today = date.today().strftime('%Y-%m-%d')

    if unknown_categories:
        save_data = False
        log_location = log_folder / f"{country_code}_unknown_categories_{today}.csv"
        print(f"Unknown rows found for {country_code}. Not saving data. Savin log to "
              f"{log_location}" )
        save_unknown_categories_info(unknown_categories, log_location)

    if last_row_info:
        save_data = False
        log_location = log_folder / f"{country_code}_last_row_info_{today}.csv"
        print(f"Data found in the last row found for {country_code}. Not saving data. Savin log to "
              f"{log_location}")
        save_last_row_info(last_row_info, log_location)

    if save_data:
        compression = dict(zlib=True, complevel=9)
        output_folder = extracted_data_path / country_name.replace(" ", "_")
        output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"

        # also create missing parents and tolerate an already existing folder
        output_folder.mkdir(parents=True, exist_ok=True)

        # write data in interchange format
        pm2.pm2io.write_interchange_format(output_folder / output_filename,
                                           ds_all.pr.to_interchange_format())

        # write data in native PRIMAP2 format
        encoding = {var: compression for var in ds_all.data_vars}
        ds_all.pr.to_netcdf(output_folder / (output_filename + ".nc"),
                            encoding=encoding)

    return ds_all
def read_crf_for_country_datalad(
        country: str,
        submission_year: int,
        submission_date: Optional[str]=None,
        re_read: Optional[bool]=True
) -> None:
    """
    Wrapper around read_crf_for_country which takes care of selecting input
    and output files and using datalad run to trigger the data reading

    Parameters
    __________

    country: str
        Country to read, passed on to get_input_and_output_files_for_country
        and to the reading script via `--country`

    submission_year: int
        Year of the submission of the data

    submission_date: Optional(str)
        Read for a specific submission date (given as string as in the file names)
        If not specified latest data will be read

    re_read: Optional(bool) default: True
        If true the `--re_read` flag is passed to the reading script
    """
    # get all the info for the country. The requested date has to be passed on,
    # otherwise input and output files would always refer to the latest
    # submission, even when a specific date is read.
    country_info = get_input_and_output_files_for_country(
        country,
        submission_year=submission_year,
        submission_date=submission_date,
        verbose=True,
    )
    # the date the files were determined for (the given date, or the latest
    # date if none was given); used so command and commit message never
    # contain a literal "None"
    resolved_date = country_info["date"]

    print(f"Attempting to read data for CRF{submission_year} from {country}.")
    print("#"*80)
    print("")
    print("Using the UNFCCC_CRF_reader")
    print("")
    print("Run the script using datalad run via the python api")

    script = code_path / "UNFCCC_CRF_reader" / "read_UNFCCC_CRF_submission.py"
    cmd = f"./venv/bin/python3 {script.as_posix()} --country={country} "\
          f"--submission_year={submission_year} --submission_date={resolved_date}"
    if re_read:
        cmd = cmd + " --re_read"

    datalad.api.run(
        cmd=cmd,
        dataset=root_path,
        message=f"Read data for {country}, CRF{submission_year}, {resolved_date}.",
        inputs=country_info["input"],
        outputs=country_info["output"],
        dry_run=None,
        explicit=True,
    )
def read_new_crf_for_year(
        submission_year: int,
        countries: Optional[List[str]]=None,
        re_read: Optional[bool]=False,
) -> dict:
    """
    Read the CRF submissions of one submission year for several countries.

    Every country in `countries` (or every CRF country if no list is given)
    is read via read_crf_for_country, which picks the latest submission,
    reads all tables available in the specification for all years and writes
    the results to the appropriate country folders. A failure for one
    country does not stop the loop; it is recorded and reported at the end.

    If you want to read data from a different folder use the
    test_read_crf_data function.

    Currently there is no consistency check between data for the same category
    read from different tables

    Parameters
    __________

    submission_year: int
        Year of the submission of the data

    countries: List[str] (optional)
        List of countries to read. If not given reading is tried for all
        CRF countries

    re_read: bool (optional, default=False)
        If true data will be read even if already read before.

    TODO: write log with failed countries and what has been read

    Returns
    _______

    dict: maps each country to one of the status strings "read", "skipped",
    "no data" or "failed"
    """
    selected = all_crf_countries if countries is None else countries

    status_by_country = {}
    for country in selected:
        try:
            dataset = read_crf_for_country(country, submission_year, re_read=re_read)
        except NoCRFFilesError:
            print(f"No data for country {country}, {submission_year}")
            status = "no data"
        except Exception as ex:
            print(f"Data for country {country}, {submission_year} could not be read")
            print(f"The following error occurred: {ex}")
            status = "failed"
        else:
            # no dataset means nothing was read for this country
            status = "skipped" if dataset is None else "read"
        status_by_country[country] = status

    def _with_status(wanted):
        """Countries whose recorded status equals `wanted`, in reading order."""
        return [name for name, state in status_by_country.items() if state == wanted]

    # print overview
    done = _with_status("read")
    skipped = _with_status("skipped")
    failed = _with_status("failed")
    without_data = _with_status("no data")
    print(f"Read data for countries {done}")
    print(f"Skipped countries {skipped}")
    print(f"No data for countries {without_data}")
    print(f"!!!!! Reading failed for {failed}. Check why")

    return status_by_country
def read_new_crf_for_year_datalad(
        submission_year: int,
        countries: Optional[List[str]] = None,
        re_read: Optional[bool] = False,
) -> None:
    """
    Wrapper around read_new_crf_for_year which takes care of selecting input
    and output files and using datalad run to trigger the data reading

    Parameters
    __________

    submission_year: int
        Year of the submission of the data

    countries: List[str] (optional)
        List of countries to read. If not given reading is tried for all
        CRF countries

    re_read: bool (optional, default=False)
        If true data will be read even if already read before.
    """
    if countries is not None:
        print(f"Reading CRF{submission_year} for countries {countries} using UNFCCC_CRF_reader.")
    else:
        print(f"Reading CRF{submission_year} for all countries using UNFCCC_CRF_reader.")
        countries = all_crf_countries
    print("#" * 80)
    print("")
    if re_read:
        print("Re-reading all latest submissions.")
    else:
        print("Only reading new submissions not read yet.")

    input_files = []
    output_files = []
    # loop over countries to collect input and output files
    print("Collect input and output files to pass to datalad")
    for country in countries:
        try:
            country_info = get_input_and_output_files_for_country(
                country, submission_year=submission_year, verbose=False)
            # check if the submission has been read already (only relevant
            # when we are not re-reading everything)
            already_read = (not re_read) and submission_has_been_read(
                country_info["code"], country_info["name"],
                submission_year=submission_year,
                submission_date=country_info["date"],
                verbose=False,
            )
        except Exception as ex:
            # Best effort: a country without usable files is left out here,
            # the actual error handling is done in the function that does the
            # reading. Only `Exception` is caught so that KeyboardInterrupt
            # and SystemExit still stop the run.
            print(f"Not collecting files for {country}: {ex}")
            continue

        if already_read:
            continue
        input_files.extend(country_info["input"])
        output_files.extend(country_info["output"])

    print("Run the script using datalad run via the python api")
    script = code_path / "UNFCCC_CRF_reader" / "read_new_UNFCCC_CRF_for_year.py"

    # NOTE(review): the country list is not passed to the script (the variant
    # with --countries is disabled below) — confirm whether the script then
    # reads all countries even if `countries` was given.
    #cmd = f"./venv/bin/python3 {script.as_posix()} --countries={countries} "\
    #      f"--submission_year={submission_year}"
    cmd = f"./venv/bin/python3 {script.as_posix()} " \
          f"--submission_year={submission_year}"
    if re_read:
        cmd = cmd + " --re_read"
    datalad.api.run(
        cmd=cmd,
        dataset=root_path,
        message=f"Read data for {countries}, CRF{submission_year}. Re-reading: {re_read}",
        inputs=input_files,
        outputs=output_files,
        dry_run=None,
        #explicit=True,
    )
- # function to read all available data (or list of countries?)
- # make sure it works when not all countries have submitted data
- # give option to only read new data (no output yet), but also option to
- # read all data, e.g. when specifications have changed
def get_input_and_output_files_for_country(
        country: str,
        submission_year: int,
        submission_date: Optional[str]=None,
        verbose: Optional[bool]=True,
) -> Dict[str, Union[List, str]]:
    """
    Get input and output files for a given country

    Parameters
    __________

    country: str
        Country to look up. Used as country code directly if it is a key of
        `custom_country_mapping`, otherwise it is converted with
        get_country_code

    submission_year: int
        Year of the submission of the data

    submission_date: Optional(str)
        Submission date to use. If not given the latest date is determined
        with get_latest_date_for_country

    verbose: Optional(bool) default: True
        Print details on the date and the files found

    Returns
    _______

    dict with the keys "code" (country code), "name" (country name),
    "date" (submission date used), "input" (list of input file paths as
    posix strings) and "output" (list of output file paths as posix strings
    with the suffixes yaml, csv and nc)

    Raises
    ______

    NoCRFFilesError
        If no submission date or no input files could be found
    """
    country_info = {}

    if country in custom_country_mapping:
        country_code = country
    else:
        country_code = get_country_code(country)
    # now get the country name
    country_name = get_country_name(country_code)
    country_info["code"] = country_code
    country_info["name"] = country_name

    # determine latest data
    print(f"Determining input and output files for {country}")
    if submission_date is None:
        if verbose:
            print("No submission date given, find latest date.")
        submission_date = get_latest_date_for_country(country_code, submission_year)
    elif verbose:
        print(f"Using given submissions date {submission_date}")

    if submission_date is None:
        # there is no data. Raise an exception.
        # The message reports the submission date; the bare name `date` would
        # refer to the imported datetime.date class instead.
        raise NoCRFFilesError(f"No submissions found for {country_code}, "
                              f"submission_year={submission_year}, "
                              f"date={submission_date}")
    if verbose:
        print(f"Latest submission date for CRF{submission_year} is {submission_date}")
    country_info["date"] = submission_date

    # get possible input files
    input_files = get_crf_files(country_codes=country_code,
                                submission_year=submission_year,
                                date=submission_date)
    if not input_files:
        raise NoCRFFilesError(f"No possible input files found for {country}, CRF{submission_year}, "
                              f"v{submission_date}. Are they already submitted and included in the "
                              f"repository?")
    if verbose:
        print("Found the following input_files:")
        for file in input_files:
            print(file.name)
        print("")

    # convert file's path to str
    country_info["input"] = [file.as_posix() for file in input_files]

    # get output file
    output_folder = extracted_data_path / country_name.replace(" ", "_")
    output_files = [output_folder / f"{country_code}_CRF{submission_year}"
                                    f"_{submission_date}.{suffix}"
                    for suffix in ['yaml', 'csv', 'nc']]
    if verbose:
        print("The following files are considered as output_files:")
        for file in output_files:
            print(file)
        print("")

    # convert file paths to str
    country_info["output"] = [file.as_posix() for file in output_files]

    return country_info
def submission_has_been_read(
        country_code: str,
        country_name: str,
        submission_year: int,
        submission_date: str,
        verbose: Optional[bool]=True,
) -> bool:
    """
    Check if a CRF submission has already been read

    A submission counts as read if the country folder contains files named
    '<country_code>_CRF<submission_year>_<submission_date>' with all of the
    suffixes .nc, .yaml and .csv.

    Parameters
    __________

    country_code: str
        Country code used in the output file names

    country_name: str
        Country name; with spaces replaced by underscores it is the name of
        the country folder below `extracted_data_path`

    submission_year: int
        Year of the submission of the data

    submission_date: str
        Date of the submission (given as string as in the file names)

    verbose: Optional(bool) default: True
        Print which of the cases (all, some or no files present) was found

    Returns
    _______

    bool: True if all three output files exist, False otherwise
    """
    output_folder = extracted_data_path / country_name.replace(" ", "_")
    output_filename = f"{country_code}_CRF{submission_year}_{submission_date}"
    required_suffixes = {".nc", ".yaml", ".csv"}

    # suffixes of the files already present for this submission (none if the
    # country folder does not exist yet)
    existing_suffixes = set()
    if output_folder.exists():
        existing_suffixes = {file.suffix
                             for file in output_folder.glob(f"{output_filename}.*")}

    has_been_read = required_suffixes <= existing_suffixes

    if verbose:
        if has_been_read:
            print(f"Data already available for {country_code}, "
                  f"CRF{submission_year}, version {submission_date}.")
        elif existing_suffixes:
            # some, but not all, of the expected files are present
            print(f"Partial data available for {country_code}, "
                  f"CRF{submission_year}, version {submission_date}. "
                  "Please check if all files have been written after "
                  "reading.")
        else:
            # missing folder or a folder without any file for this submission;
            # reporting the latter as "partial" would be misleading
            print(f"No read data available for {country_code}, "
                  f"CRF{submission_year}, version {submission_date}. ")

    return has_been_read