@@ -15,76 +15,84 @@ from .definitions import root_path, downloaded_data_path, extracted_data_path
 from .definitions import legacy_data_path, code_path
 from .definitions import GWP_factors
 
+
 def process_data_for_country(
-        data_country: xr.Dataset,
-        entities_to_ignore: List[str],
-        gas_baskets: Dict[str, List[str]],
-        filter_dims: Optional[Dict[str, List[str]]] = None,
-        cat_terminology_out: Optional[str] = None,
-        category_conversion: Dict[str, Dict] = None,
-        sectors_out: List[str] = None,
-        processing_info_country: Dict = None,
+    data_country: xr.Dataset,
+    entities_to_ignore: List[str],
+    gas_baskets: Dict[str, List[str]],
+    filter_dims: Optional[Dict[str, List[str]]] = None,
+    cat_terminology_out: Optional[str] = None,
+    category_conversion: Dict[str, Dict] = None,
+    sectors_out: List[str] = None,
+    processing_info_country: Dict = None,
 ) -> xr.Dataset:
     """
-        Process data from DI interface (where necessary).
-        * Downscaling including subtraction of time series
-        * country specific sector aggregation
-        * Conversion to IPCC2006 categories
-        * general sector and gas basket aggregation (in new categories)
+    Process data from DI interface (where necessary).
+    * Downscaling including subtraction of time series
+    * country specific sector aggregation
+    * Conversion to IPCC2006 categories
+    * general sector and gas basket aggregation (in new categories)
     """
 
     # 0: gather information
-    countries = list(data_country.coords[data_country.attrs['area']].values)
+    countries = list(data_country.coords[data_country.attrs["area"]].values)
     if len(countries) > 1:
         raise ValueError(
             f"Found {len(countries)} countries. Only single country data "
-            f"can be processed by this function. countries: {countries}")
+            f"can be processed by this function. countries: {countries}"
+        )
     else:
         country_code = countries[0]
 
     # get category terminology
-    cat_col = data_country.attrs['cat']
-    temp = re.findall(r'\((.*)\)', cat_col)
+    cat_col = data_country.attrs["cat"]
+    temp = re.findall(r"\((.*)\)", cat_col)
     cat_terminology_in = temp[0]
 
     # get scenario
-    scenarios = list(data_country.coords[data_country.attrs['scen']].values)
+    scenarios = list(data_country.coords[data_country.attrs["scen"]].values)
     if len(scenarios) > 1:
         raise ValueError(
             f"Found {len(scenarios)} scenarios. Only single scenario data "
-            f"can be processed by this function. Scenarios: {scenarios}")
+            f"can be processed by this function. Scenarios: {scenarios}"
+        )
     scenario = scenarios[0]
 
     # get source
-    sources = list(data_country.coords['source'].values)
+    sources = list(data_country.coords["source"].values)
     if len(sources) > 1:
         raise ValueError(
             f"Found {len(sources)} sources. Only single source data "
-            f"can be processed by this function. Sources: {sources}")
+            f"can be processed by this function. Sources: {sources}"
+        )
     source = sources[0]
 
     # check if category name column present
     # TODO: replace 'name' in config by 'additional_cols' dict that defines the cols
     # and the values
-    if 'orig_cat_name' in data_country.coords:
+    if "orig_cat_name" in data_country.coords:
         cat_name_present = True
     else:
         cat_name_present = False
 
     # 1: general processing
     # remove unused cats
-    data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
+    data_country = data_country.dropna(f"category ({cat_terminology_in})", how="all")
     # remove unused years
-    data_country = data_country.dropna(f'time', how='all')
+    data_country = data_country.dropna(f"time", how="all")
     # remove variables only containing nan
-    nan_vars_country = [var for var in data_country.data_vars if
-                        bool(data_country[var].isnull().all().data) is True]
+    nan_vars_country = [
+        var
+        for var in data_country.data_vars
+        if bool(data_country[var].isnull().all().data) is True
+    ]
     print(f"removing all-nan variables: {nan_vars_country}")
     data_country = data_country.drop_vars(nan_vars_country)
 
     # remove unnecessary variables
-    entities_ignore_present = [entity for entity in entities_to_ignore if
-                               entity in data_country.data_vars]
+    entities_ignore_present = [
+        entity for entity in entities_to_ignore if entity in data_country.data_vars
+    ]
     data_country = data_country.drop_vars(entities_ignore_present)
 
     # filter ()
@@ -93,167 +101,200 @@ def process_data_for_country(
 
     # 2: country specific processing
     if processing_info_country is not None:
-
-        if 'tolerance' in processing_info_country:
+        if "tolerance" in processing_info_country:
             tolerance = processing_info_country["tolerance"]
         else:
             tolerance = 0.01
 
         # remove entities if needed
-        if 'ignore_entities' in processing_info_country:
-            entities_to_ignore_country = processing_info_country[
-                'ignore_entities']
-            entities_ignore_present = \
-                [entity for entity in entities_to_ignore_country if
-                 entity in data_country.data_vars]
+        if "ignore_entities" in processing_info_country:
+            entities_to_ignore_country = processing_info_country["ignore_entities"]
+            entities_ignore_present = [
+                entity
+                for entity in entities_to_ignore_country
+                if entity in data_country.data_vars
+            ]
             data_country = data_country.drop_vars(entities_ignore_present)
 
         # take only desired years
-        if 'years' in processing_info_country:
+        if "years" in processing_info_country:
             data_country = data_country.pr.loc[
-                {'time': processing_info_country['years']}]
+                {"time": processing_info_country["years"]}
+            ]
 
         # remove timeseries if desired
-        if 'remove_ts' in processing_info_country:
-            for case in processing_info_country['remove_ts']:
-                remove_info = copy.deepcopy(processing_info_country['remove_ts'][case])
+        if "remove_ts" in processing_info_country:
+            for case in processing_info_country["remove_ts"]:
+                remove_info = copy.deepcopy(processing_info_country["remove_ts"][case])
                 entities = remove_info.pop("entities")
                 for entity in entities:
-                    data_country[entity].pr.loc[remove_info] = \
+                    data_country[entity].pr.loc[remove_info] = (
                         data_country[entity].pr.loc[remove_info] * np.nan
+                    )
 
         # remove all data for given years if necessary
-        if 'remove_years' in processing_info_country:
+        if "remove_years" in processing_info_country:
             data_country = data_country.drop_sel(
-                time=processing_info_country['remove_years'])
+                time=processing_info_country["remove_years"]
+            )
 
         # subtract categories
-        if 'subtract_cats' in processing_info_country:
-            subtract_cats_current = processing_info_country['subtract_cats']
+        if "subtract_cats" in processing_info_country:
+            subtract_cats_current = processing_info_country["subtract_cats"]
             print(f"Subtracting categories for country {country_code}")
             for cat_to_generate in subtract_cats_current:
-                if 'entities' in subtract_cats_current[cat_to_generate].keys():
-                    entities_current = subtract_cats_current[cat_to_generate]['entities']
+                if "entities" in subtract_cats_current[cat_to_generate].keys():
+                    entities_current = subtract_cats_current[cat_to_generate][
+                        "entities"
+                    ]
                 else:
                     entities_current = list(data_country.data_vars)
 
-                cats_to_subtract = \
-                    subtract_cats_current[cat_to_generate]['subtract']
-                data_sub = \
-                    data_country[entities_current].pr.loc[
-                        {'category': cats_to_subtract}].pr.sum(
-                        dim='category', skipna=True, min_count=1)
+                cats_to_subtract = subtract_cats_current[cat_to_generate]["subtract"]
+                data_sub = (
+                    data_country[entities_current]
+                    .pr.loc[{"category": cats_to_subtract}]
+                    .pr.sum(dim="category", skipna=True, min_count=1)
+                )
                 data_parent = data_country[entities_current].pr.loc[
-                    {'category': subtract_cats_current[cat_to_generate]['parent']}]
+                    {"category": subtract_cats_current[cat_to_generate]["parent"]}
+                ]
                 data_agg = data_parent - data_sub
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if data_agg[var].isnull().all().data is True
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
                     print(f"Generating {cat_to_generate} through subtraction")
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
 
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_generate])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_generate],
+                            )
+                        }
+                    )
                     if cat_name_present:
-                        cat_name = subtract_cats_current[cat_to_generate]['name']
+                        cat_name = subtract_cats_current[cat_to_generate]["name"]
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(data_agg, tolerance=tolerance)
                 else:
                     print(f"no data to generate category {cat_to_generate}")
 
         # downscaling
-        if 'downscale' in processing_info_country:
-            if 'sectors' in processing_info_country['downscale']:
-                sector_downscaling = \
-                    processing_info_country['downscale']['sectors']
+        if "downscale" in processing_info_country:
+            if "sectors" in processing_info_country["downscale"]:
+                sector_downscaling = processing_info_country["downscale"]["sectors"]
                 for case in sector_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     sector_downscaling_current = sector_downscaling[case]
-                    entities = sector_downscaling_current.pop('entities')
+                    entities = sector_downscaling_current.pop("entities")
                    for entity in entities:
                         data_country[entity] = data_country[
-                            entity].pr.downscale_timeseries(
-                            **sector_downscaling_current)
+                            entity
+                        ].pr.downscale_timeseries(**sector_downscaling_current)
                         # , skipna_evaluation_dims=None)
 
-            if 'entities' in processing_info_country['downscale']:
-                entity_downscaling = \
-                    processing_info_country['downscale']['entities']
+            if "entities" in processing_info_country["downscale"]:
+                entity_downscaling = processing_info_country["downscale"]["entities"]
                 for case in entity_downscaling.keys():
                     print(f"Downscaling for {case}.")
                     # print(data_country.coords[f'category ('
                     #                           f'{cat_terminology_in})'].values)
                     data_country = data_country.pr.downscale_gas_timeseries(
-                        **entity_downscaling[case], skipna=True,
-                        skipna_evaluation_dims=None)
+                        **entity_downscaling[case],
+                        skipna=True,
+                        skipna_evaluation_dims=None,
+                    )
 
         # aggregate categories
-        if 'aggregate_cats' in processing_info_country:
-            if 'agg_tolerance' in processing_info_country:
-                agg_tolerance = processing_info_country['agg_tolerance']
+        if "aggregate_cats" in processing_info_country:
+            if "agg_tolerance" in processing_info_country:
+                agg_tolerance = processing_info_country["agg_tolerance"]
            else:
                 agg_tolerance = tolerance
-            aggregate_cats_current = processing_info_country['aggregate_cats']
+            aggregate_cats_current = processing_info_country["aggregate_cats"]
             print(
                 f"Aggregating categories for country {country_code}, source {source}, "
-                f"scenario {scenario}")
+                f"scenario {scenario}"
+            )
             for cat_to_agg in aggregate_cats_current:
                 print(f"Category: {cat_to_agg}")
-                source_cats = aggregate_cats_current[cat_to_agg]['sources']
-                data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
-                    dim='category', skipna=True, min_count=1)
-                nan_vars = [var for var in data_agg.data_vars if
-                            data_agg[var].isnull().all().data is True]
+                source_cats = aggregate_cats_current[cat_to_agg]["sources"]
+                data_agg = data_country.pr.loc[{"category": source_cats}].pr.sum(
+                    dim="category", skipna=True, min_count=1
+                )
+                nan_vars = [
+                    var
+                    for var in data_agg.data_vars
+                    if data_agg[var].isnull().all().data is True
+                ]
                 data_agg = data_agg.drop(nan_vars)
                 if len(data_agg.data_vars) > 0:
-                    data_agg = data_agg.expand_dims([f'category ('
-                                                     f'{cat_terminology_in})'])
+                    data_agg = data_agg.expand_dims(
+                        [f"category (" f"{cat_terminology_in})"]
+                    )
                     data_agg = data_agg.assign_coords(
-                        coords={f'category ({cat_terminology_in})':
-                                    (f'category ({cat_terminology_in})',
-                                     [cat_to_agg])})
+                        coords={
+                            f"category ({cat_terminology_in})": (
+                                f"category ({cat_terminology_in})",
+                                [cat_to_agg],
+                            )
+                        }
+                    )
                     if cat_name_present:
-                        cat_name = aggregate_cats_current[cat_to_agg]['name']
+                        cat_name = aggregate_cats_current[cat_to_agg]["name"]
                         data_agg = data_agg.assign_coords(
-                            coords={'orig_cat_name':
-                                        (f'category ({cat_terminology_in})',
-                                         [cat_name])})
-                    data_country = data_country.pr.merge(data_agg,
-                                                         tolerance=agg_tolerance)
+                            coords={
+                                "orig_cat_name": (
+                                    f"category ({cat_terminology_in})",
+                                    [cat_name],
+                                )
+                            }
+                        )
+                    data_country = data_country.pr.merge(
+                        data_agg, tolerance=agg_tolerance
+                    )
                 else:
                     print(f"no data to aggregate category {cat_to_agg}")
 
         # copy HFCs and PFCs with default factors
-        if 'basket_copy' in processing_info_country:
+        if "basket_copy" in processing_info_country:
             GWPs_to_add = processing_info_country["basket_copy"]["GWPs_to_add"]
             entities = processing_info_country["basket_copy"]["entities"]
             source_GWP = processing_info_country["basket_copy"]["source_GWP"]
             for entity in entities:
-                data_source = data_country[f'{entity} ({source_GWP})']
+                data_source = data_country[f"{entity} ({source_GWP})"]
                 for GWP in GWPs_to_add:
-                    data_GWP = data_source * \
-                               GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    data_GWP = (
+                        data_source * GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
+                    )
                     data_GWP.attrs["entity"] = entity
                     data_GWP.attrs["gwp_context"] = GWP
                     data_country[f"{entity} ({GWP})"] = data_GWP
 
         # aggregate gases if desired
-        if 'aggregate_gases' in processing_info_country:
+        if "aggregate_gases" in processing_info_country:
             # TODO: why use different code here than below. Can this fill non-existen
             # gas baskets?
-            for case in processing_info_country['aggregate_gases'].keys():
-                case_info = processing_info_country['aggregate_gases'][case]
-                data_country[case_info['basket']] = \
-                    data_country.pr.fill_na_gas_basket_from_contents(
-                        **case_info)
+            for case in processing_info_country["aggregate_gases"].keys():
+                case_info = processing_info_country["aggregate_gases"][case]
+                data_country[
+                    case_info["basket"]
+                ] = data_country.pr.fill_na_gas_basket_from_contents(**case_info)
 
     # 3: map categories
     if category_conversion is not None:
@@ -270,61 +311,74 @@ def process_data_for_country(
     # more general processing
     # reduce categories to output cats
     if sectors_out is not None:
-        cats_to_keep = [cat for cat in
-                        data_country.coords[f'category ({cat_terminology_out})'].values
-                        if cat in sectors_out]
-        data_country = data_country.pr.loc[{'category': cats_to_keep}]
+        cats_to_keep = [
+            cat
+            for cat in data_country.coords[f"category ({cat_terminology_out})"].values
+            if cat in sectors_out
+        ]
+        data_country = data_country.pr.loc[{"category": cats_to_keep}]
 
     # create gas baskets
     entities_present = set(data_country.data_vars)
     for basket in gas_baskets.keys():
-        basket_contents_present = [gas for gas in gas_baskets[basket] if
-                                   gas in entities_present]
+        basket_contents_present = [
+            gas for gas in gas_baskets[basket] if gas in entities_present
+        ]
         if len(basket_contents_present) > 0:
             if basket in list(data_country.data_vars):
                 data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
-                    basket=basket, basket_contents=basket_contents_present,
-                    skipna=True, min_count=1)
+                    basket=basket,
+                    basket_contents=basket_contents_present,
+                    skipna=True,
+                    min_count=1,
+                )
             else:
                 try:
-                    #print(data_country.data_vars)
-                    data_country[basket] = xr.full_like(data_country["CO2"],
-                                                        np.nan).pr.quantify(
-                        units="Gg CO2 / year")
-                    data_country[basket].attrs = {"entity": basket.split(' ')[0],
-                                                  "gwp_context": basket.split(' ')[1][
-                                                                 1:-1]}
+                    # print(data_country.data_vars)
+                    data_country[basket] = xr.full_like(
+                        data_country["CO2"], np.nan
+                    ).pr.quantify(units="Gg CO2 / year")
+                    data_country[basket].attrs = {
+                        "entity": basket.split(" ")[0],
+                        "gwp_context": basket.split(" ")[1][1:-1],
+                    }
                     data_country[basket] = data_country.pr.gas_basket_contents_sum(
-                        basket=basket, basket_contents=basket_contents_present,
-                        min_count=1)
+                        basket=basket,
+                        basket_contents=basket_contents_present,
+                        min_count=1,
+                    )
                     entities_present.add(basket)
                 except Exception as ex:
-                    print(f"No gas basket created for {country_code}, {source}, "
-                          f"{scenario}: {ex}")
+                    print(
+                        f"No gas basket created for {country_code}, {source}, "
+                        f"{scenario}: {ex}"
+                    )
 
     # amend title and comment
-    data_country.attrs["comment"] = data_country.attrs["comment"] + f" Processed on " \
-                                    f"{date.today()}"
-    data_country.attrs["title"] = data_country.attrs["title"] + f" Processed on " \
-                                  f"{date.today()}"
+    data_country.attrs["comment"] = (
+        data_country.attrs["comment"] + f" Processed on " f"{date.today()}"
+    )
+    data_country.attrs["title"] = (
+        data_country.attrs["title"] + f" Processed on " f"{date.today()}"
+    )
 
     return data_country
 
 
 def convert_categories(
-        ds_input: xr.Dataset,
-        conversion: Dict[str, Dict[str, str]],
-        #terminology_from: str,
-        terminology_to: str,
-        debug: bool=False,
-        tolerance: float=0.01,
-)->xr.Dataset:
+    ds_input: xr.Dataset,
+    conversion: Dict[str, Dict[str, str]],
+    # terminology_from: str,
+    terminology_to: str,
+    debug: bool = False,
+    tolerance: float = 0.01,
+) -> xr.Dataset:
     """
     convert data from one category terminology to another
     """
     print(f"converting categories to {terminology_to}")
 
-    if 'orig_cat_name' in ds_input.coords:
+    if "orig_cat_name" in ds_input.coords:
         cat_name_present = True
     else:
         cat_name_present = False
@@ -338,50 +392,67 @@ def convert_categories(
     ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})
 
     # find categories present in dataset
-    cats_present = list(ds_converted.coords[f'category ({terminology_to})'])
+    cats_present = list(ds_converted.coords[f"category ({terminology_to})"])
 
     # restrict categories and map category names
-    if 'mapping' in conversion.keys():
-        mapping_cats_present = [cat for cat in list(conversion['mapping'].keys()) if
-                                cat in cats_present]
-        ds_converted = ds_converted.pr.loc[
-            {'category': mapping_cats_present}]
-
-        from_cats = ds_converted.coords[f'category ({terminology_to})'].values
-        to_cats = pd.Series(from_cats).replace(conversion['mapping'])
-        ds_converted = ds_converted.assign_coords({f'category ({terminology_to})':
-                                                       (f'category ({terminology_to})',
-                                                        to_cats)})
+    if "mapping" in conversion.keys():
+        mapping_cats_present = [
+            cat for cat in list(conversion["mapping"].keys()) if cat in cats_present
+        ]
+        ds_converted = ds_converted.pr.loc[{"category": mapping_cats_present}]
+
+        from_cats = ds_converted.coords[f"category ({terminology_to})"].values
+        to_cats = pd.Series(from_cats).replace(conversion["mapping"])
+        ds_converted = ds_converted.assign_coords(
+            {f"category ({terminology_to})": (f"category ({terminology_to})", to_cats)}
+        )
 
     # redo the list of present cats after mapping, as we have new categories in the
     # target terminology now
-    cats_present_mapped = list(ds_converted.coords[f'category ('
-                                                    f'{terminology_to})'].values)
+    cats_present_mapped = list(
+        ds_converted.coords[f"category (" f"{terminology_to})"].values
+    )
     # aggregate categories
-    if 'aggregate' in conversion:
-        aggregate_cats = conversion['aggregate']
+    if "aggregate" in conversion:
+        aggregate_cats = conversion["aggregate"]
         for cat_to_agg in aggregate_cats:
             if debug:
                 print(f"Category: {cat_to_agg}")
-            source_cats = [cat for cat in aggregate_cats[cat_to_agg]['sources'] if
-                           cat in cats_present_mapped]
+            source_cats = [
+                cat
+                for cat in aggregate_cats[cat_to_agg]["sources"]
+                if cat in cats_present_mapped
+            ]
             if debug:
                 print(source_cats)
-            data_agg = ds_converted.pr.loc[{'category': source_cats}].pr.sum(
-                dim='category', skipna=True, min_count=1)
-            nan_vars = [var for var in data_agg.data_vars if
-                        data_agg[var].isnull().all().data == True]
+            data_agg = ds_converted.pr.loc[{"category": source_cats}].pr.sum(
+                dim="category", skipna=True, min_count=1
+            )
+            nan_vars = [
+                var
+                for var in data_agg.data_vars
+                if data_agg[var].isnull().all().data == True
+            ]
             data_agg = data_agg.drop(nan_vars)
             if len(data_agg.data_vars) > 0:
-                data_agg = data_agg.expand_dims([f'category ({terminology_to})'])
+                data_agg = data_agg.expand_dims([f"category ({terminology_to})"])
                 data_agg = data_agg.assign_coords(
-                    coords={f'category ({terminology_to})':
-                                (f'category ({terminology_to})', [cat_to_agg])})
+                    coords={
+                        f"category ({terminology_to})": (
+                            f"category ({terminology_to})",
+                            [cat_to_agg],
+                        )
+                    }
+                )
                 if cat_name_present:
                     data_agg = data_agg.assign_coords(
-                        coords={'orig_cat_name':
-                                    (f'category ({terminology_to})',
-                                     [aggregate_cats[cat_to_agg]['name']])})
+                        coords={
+                            "orig_cat_name": (
+                                f"category ({terminology_to})",
+                                [aggregate_cats[cat_to_agg]["name"]],
+                            )
+                        }
+                    )
                 ds_converted = ds_converted.pr.merge(data_agg, tolerance=tolerance)
                 cats_present_mapped.append(cat_to_agg)
             else:
@@ -391,9 +462,9 @@ def convert_categories(
 
 
 def get_country_name(
-        country_code: str,
+    country_code: str,
 ) -> str:
-    """get country name from code """
+    """get country name from code"""
     if country_code in custom_country_mapping:
         country_name = custom_country_mapping[country_code]
     else:
@@ -401,15 +472,16 @@ def get_country_name(
             country = pycountry.countries.get(alpha_3=country_code)
             country_name = country.name
         except:
-            raise ValueError(f"Country code {country_code} can not be mapped to "
-                             f"any country")
+            raise ValueError(
+                f"Country code {country_code} can not be mapped to " f"any country"
+            )
 
     return country_name
 
 
 def get_country_code(
-        country_name: str,
-)->str:
+    country_name: str,
+) -> str:
     """
     obtain country code. If the input is a code it will be returned,
     if the input
@@ -435,28 +507,31 @@ def get_country_code(
             country_code = country.alpha_3
         except:
             try:
-                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
+                country = pycountry.countries.search_fuzzy(
+                    country_name.replace("_", " ")
+                )
             except:
-                raise ValueError(f"Country name {country_name} can not be mapped to "
-                                 f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
+                raise ValueError(
+                    f"Country name {country_name} can not be mapped to "
+                    f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly."
+                )
             if len(country) > 1:
                 country_code = None
                 for current_country in country:
                     if current_country.name == country_name:
                         country_code = current_country.alpha_3
                 if country_code is None:
-                    raise ValueError(f"Country name {country_name} has {len(country)} "
-                                     f"possible results for country codes.")
+                    raise ValueError(
+                        f"Country name {country_name} has {len(country)} "
+                        f"possible results for country codes."
+                    )
 
             country_code = country[0].alpha_3
 
     return country_code
 
 
-def create_folder_mapping(
-        folder: str,
-        extracted: bool = False
-) -> None:
+def create_folder_mapping(folder: str, extracted: bool = False) -> None:
     """
     Create a mapping from 3 letter ISO country codes to folders
     based on the subfolders of the given folder. The mapping is
@@ -480,9 +555,9 @@ def create_folder_mapping(
 
     folder = root_path / folder
     folder_mapping = {}
-    #if not extracted:
+    # if not extracted:
     known_folders = custom_folders
-    #else:
+    # else:
     # known_folders = {}
 
     for item in folder.iterdir():
@@ -491,7 +566,9 @@ def create_folder_mapping(
                 ISO3 = known_folders[item.name]
             else:
                 try:
-                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
+                    country = pycountry.countries.search_fuzzy(
+                        item.name.replace("_", " ")
+                    )
                     if len(country) > 1:
                         ISO3 = None
                         for current_country in country:
@@ -516,8 +593,8 @@ def create_folder_mapping(
 
 # TODO add crf
 def get_country_submissions(
-        country_name: str,
-        print_sub: bool = True,
+    country_name: str,
+    print_sub: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO UNFCCC_GHG_data for a country, or the countries name.
@@ -585,8 +662,8 @@ def get_country_submissions(
 
 
 def get_country_datasets(
-        country_name: str,
-        print_ds: bool = True,
+    country_name: str,
+    print_ds: bool = True,
 ) -> Dict[str, List[str]]:
     """
     Input is a three letter ISO code for a country, or the country's name.
@@ -638,35 +715,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
                 datasets_current_folder = {}
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] =\
-                            dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong code: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology})"
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
 
                         code_file = get_code_file(country_code, parts[1])
@@ -680,7 +764,9 @@ def get_country_datasets(
            if print_ds:
                if cleaned_datasets_current_folder:
                    for country_ds in cleaned_datasets_current_folder:
-                        print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                        print(
+                            f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                        )
                else:
                    print("No data available")
                print("")
@@ -708,34 +794,42 @@ def get_country_datasets(
             else:
                 country_folder = folder_mapping[country_code]
                 if not isinstance(country_folder, str):
-                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")
+                    raise ValueError(
+                        "Wrong data type in folder mapping json file. Should be str."
+                    )
 
                 datasets_current_folder = {}
                 current_folder = item / country_folder
 
                 for data_file in current_folder.iterdir():
-                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
+                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                         if data_file.stem in datasets_current_folder:
-                            datasets_current_folder[data_file.stem].append(data_file.suffix)
+                            datasets_current_folder[data_file.stem].append(
+                                data_file.suffix
+                            )
                         else:
                             datasets_current_folder[data_file.stem] = [data_file.suffix]
 
                 for dataset in datasets_current_folder:
                     # process filename to get submission
-                    parts = dataset.split('_')
+                    parts = dataset.split("_")
                     if parts[0] != country_code:
-                        cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
+                        cleaned_datasets_current_folder[
+                            f"Wrong UNFCCC_GHG_data: {parts[0]}"
+                        ] = dataset
                     else:
-                        terminology = "_".join(parts[3 : ])
+                        terminology = "_".join(parts[3:])
                         key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                         data_info = ""
-                        if '.nc' in datasets_current_folder[dataset]:
+                        if ".nc" in datasets_current_folder[dataset]:
                             data_info = data_info + "NF (.nc), "
-                        if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
+                        if (".csv" in datasets_current_folder[dataset]) and (
+                            ".yaml" in datasets_current_folder[dataset]
+                        ):
                             data_info = data_info + "IF (.yaml + .csv), "
-                        elif '.csv' in datasets_current_folder[dataset]:
+                        elif ".csv" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF? (.csv), "
-                        elif '.yaml' in datasets_current_folder[dataset]:
+                        elif ".yaml" in datasets_current_folder[dataset]:
                             data_info = data_info + "incomplete IF (.yaml), "
 
                         cleaned_datasets_current_folder[key] = data_info
@@ -743,7 +837,9 @@ def get_country_datasets(
            if print_ds:
                if cleaned_datasets_current_folder:
                    for country_ds in cleaned_datasets_current_folder:
-                        print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
+                        print(
+                            f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}"
+                        )
                else:
                    print("No data available")
                print("")
@@ -759,9 +855,9 @@ def get_country_datasets(
 
 
 def get_code_file(
-        country_name: str,
-        submission: str,
-        print_info: bool = False,
+    country_name: str,
+    submission: str,
+    print_info: bool = False,
 ) -> Path:
     """
     For given country name and submission find the script that creates the data
@@ -813,13 +909,17 @@ def get_code_file(
     for file in country_folder.iterdir():
         if file.match(code_file_name_candidate):
             if code_file_path is not None:
-                raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
-                                 f"{code_file_path} and file.name. "
-                                 f"Please use only one file with name "
-                                 f"'read_ISO3_submission_XXX.YYY'.")
+                raise ValueError(
+                    f"Found multiple UNFCCC_GHG_data candidates: "
+                    f"{code_file_path} and file.name. "
+                    f"Please use only one file with name "
+                    f"'read_ISO3_submission_XXX.YYY'."
+                )
             else:
                 if print_info:
-                    print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
+                    print(
+                        f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}"
+                    )
                 code_file_path = file
 
     if code_file_path is not None:
@@ -828,8 +928,10 @@ def get_code_file(
         return None
 
 
-def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int)->pd.DataFrame:
-    '''
+def fix_rows(
+    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
+) -> pd.DataFrame:
+    """
     Function to fix rows that have been split during reading from pdf
     This is the version used for Malaysia BUR3,4. adapt for other BURs if needed
 
@@ -838,18 +940,20 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
     :param col_to_use:
     :param n_rows:
     :return:
-    '''
+    """
     for row in rows_to_fix:
-        #print(row)
+        # print(row)
         # find the row number and collect the row and the next two rows
         index = data.loc[data[col_to_use] == row].index
-        #print(list(index))
+        # print(list(index))
         if not list(index):
             print(f"Can't merge split row {row}")
             print(data[col_to_use])
-        #print(f"Merging split row {row} for table {page}")
+        # print(f"Merging split row {row} for table {page}")
         loc = data.index.get_loc(index[0])
-        if n_rows == -3:
+        if n_rows == -2:
+            locs_to_merge = list(range(loc - 1, loc + 1))
+        elif n_rows == -3:
             locs_to_merge = list(range(loc - 1, loc + 2))
         elif n_rows == -5:
             locs_to_merge = list(range(loc - 1, loc + 4))
@@ -858,7 +962,7 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
             rows_to_merge = data.iloc[locs_to_merge]
             indices_to_merge = rows_to_merge.index
             # join the three rows
-            new_row = rows_to_merge.agg(' '.join)
+            new_row = rows_to_merge.agg(" ".join)
             # replace the double spaces that are created
             # must be done here and not at the end as splits are not always
             # the same and join would produce different col values
@@ -866,6 +970,10 @@ def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
             new_row = new_row.str.replace("N O", "NO")
             new_row = new_row.str.replace(", N", ",N")
             new_row = new_row.str.replace("- ", "-")
+            # replace spaces in numbers
+            pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"
+            repl = lambda m: f"{m.group('first')}{m.group('last')}"
+            new_row = new_row.str.replace(pat, repl, regex=True)
             data.loc[indices_to_merge[0]] = new_row
             data = data.drop(indices_to_merge[1:])
-    return data
+    return data