functions.py

"""common functions for unfccc_ghg_data

Functions used by the different readers and downloaders in the unfccc_ghg_data package
"""

from __future__ import annotations

import copy
import json
import re
import warnings
from collections.abc import Hashable
from copy import deepcopy
from datetime import date
from pathlib import Path

import numpy as np
import pandas as pd
import pycountry
import xarray as xr

from .definitions import (
    GWP_factors,
    code_path,
    custom_country_mapping,
    custom_folders,
    downloaded_data_path,
    extracted_data_path,
    legacy_data_path,
    root_path,
)


def process_data_for_country(  # noqa: PLR0913, PLR0912, PLR0915
    data_country: xr.Dataset,
    entities_to_ignore: list[str],
    gas_baskets: dict[str, list[str]],
    filter_dims: dict[str, list[str]] | None = None,
    cat_terminology_out: str | None = None,
    category_conversion: dict[str, dict] | None = None,
    sectors_out: list[str] | None = None,
    processing_info_country: dict | None = None,
) -> xr.Dataset:
    """
    Process data from DI interface (where necessary).

    * Downscaling including subtraction of time series
    * country specific sector aggregation
    * Conversion to IPCC2006 categories
    * general sector and gas basket aggregation (in new categories)

    Parameters
    ----------
    data_country: xr.Dataset
        data to process
    entities_to_ignore: list[str]
        Which entities should be ignored. They will not be in the returned dataset
    gas_baskets: dict[str, list[str]]
        Gas baskets to create. Each entry consists of the basket as key and a list of
        gases that make up the basket as value
    filter_dims: Optional[dict[str, list[str]]] = None
        filter data before processing. Filter is in the format taken by PRIMAP2's
        ds.pr.loc[] functionality
    cat_terminology_out: Optional[str] = None
        Category terminology for the output dataset
    category_conversion: dict[str, dict] = None
        Definition of category conversion. The dict has two possible fields:
        * "mapping" where the value is a dict[str, str] with 1 to 1 category code
          mapping (key is the category to convert from, value the category to
          convert to)
        * "aggregate" TODO
    sectors_out: list[str] = None
        Categories to return
    processing_info_country
        more detailed processing info TODO: explain format
        The "aggregate_cats" flag is deprecated and will be removed in a future
        version. Please use "aggregate_coords" with key "category" instead.

    Returns
    -------
    xr.Dataset: processed dataset
    """
    # 0: gather information
    countries = list(data_country.coords[data_country.attrs["area"]].values)
    if len(countries) > 1:
        raise ValueError(  # noqa: TRY003
            f"Found {len(countries)} countries. Only single country data "
            f"can be processed by this function. countries: {countries}"
        )
    else:
        country_code = countries[0]

    # set default tolerance
    tolerance = 0.01

    # get category terminology
    cat_col = data_country.attrs["cat"]
    temp = re.findall(r"\((.*)\)", cat_col)
    cat_terminology_in = temp[0]

    # get scenario
    scenarios = list(data_country.coords[data_country.attrs["scen"]].values)
    if len(scenarios) > 1:
        raise ValueError(  # noqa: TRY003
            f"Found {len(scenarios)} scenarios. Only single scenario data "
            f"can be processed by this function. Scenarios: {scenarios}"
        )
    scenario = scenarios[0]

    # get source
    sources = list(data_country.coords["source"].values)
    if len(sources) > 1:
        raise ValueError(  # noqa: TRY003
            f"Found {len(sources)} sources. Only single source data "
            f"can be processed by this function. Sources: {sources}"
        )
    source = sources[0]

    # check if category name column present
    # TODO: replace 'name' in config by 'additional_cols' dict that defines the cols
    #  and the values
    if "orig_cat_name" in data_country.coords:
        cat_name_present = True
    else:
        cat_name_present = False

    # 1: general processing
    # remove unused cats
    data_country = data_country.dropna(f"category ({cat_terminology_in})", how="all")
    # remove unused years
    data_country = data_country.dropna("time", how="all")
    # remove variables only containing nan
    nan_vars_country = [
        var
        for var in data_country.data_vars
        if bool(data_country[var].isnull().all().data) is True  # noqa: PD003
    ]
    print(f"removing all-nan variables: {nan_vars_country}")
    data_country = data_country.drop_vars(nan_vars_country)

    # remove unnecessary variables
    entities_ignore_present = [
        entity for entity in entities_to_ignore if entity in data_country.data_vars
    ]
    data_country = data_country.drop_vars(entities_ignore_present)

    # filter ()
    if filter_dims is not None:
        data_country = data_country.pr.loc[filter_dims]
    # 2: country specific processing
    if processing_info_country is not None:
        if "tolerance" in processing_info_country:
            tolerance = processing_info_country["tolerance"]

        # remove entities if needed
        if "ignore_entities" in processing_info_country:
            entities_to_ignore_country = processing_info_country["ignore_entities"]
            entities_ignore_present = [
                entity
                for entity in entities_to_ignore_country
                if entity in data_country.data_vars
            ]
            data_country = data_country.drop_vars(entities_ignore_present)

        # take only desired years
        if "years" in processing_info_country:
            data_country = data_country.pr.loc[
                {"time": processing_info_country["years"]}
            ]

        # remove timeseries if desired
        if "remove_ts" in processing_info_country:
            for case in processing_info_country["remove_ts"]:
                remove_info = copy.deepcopy(processing_info_country["remove_ts"][case])
                entities = remove_info.pop("entities")
                for entity in entities:
                    data_country[entity].pr.loc[remove_info] *= np.nan

        # remove all data for given years if necessary
        if "remove_years" in processing_info_country:
            data_country = data_country.drop_sel(
                time=processing_info_country["remove_years"]
            )

        # subtract categories
        if "subtract_cats" in processing_info_country:
            subtract_cats_current = processing_info_country["subtract_cats"]
            print(f"Subtracting categories for country {country_code}")
            for cat_to_generate in subtract_cats_current:
                if "entities" in subtract_cats_current[cat_to_generate].keys():
                    entities_current = subtract_cats_current[cat_to_generate][
                        "entities"
                    ]
                else:
                    entities_current = list(data_country.data_vars)

                cats_to_subtract = subtract_cats_current[cat_to_generate]["subtract"]
                data_sub = (
                    data_country[entities_current]
                    .pr.loc[{"category": cats_to_subtract}]
                    .pr.sum(dim="category", skipna=True, min_count=1)
                )
                data_parent = data_country[entities_current].pr.loc[
                    {"category": subtract_cats_current[cat_to_generate]["parent"]}
                ]
                data_agg = data_parent - data_sub
                nan_vars = [
                    var
                    for var in data_agg.data_vars
                    if data_agg[var].isnull().all().data is True  # noqa: PD003
                ]
                data_agg = data_agg.drop_vars(nan_vars)
                if len(data_agg.data_vars) > 0:
                    print(f"Generating {cat_to_generate} through subtraction")
                    data_agg = data_agg.expand_dims(
                        [f"category ({cat_terminology_in})"]
                    )
                    data_agg = data_agg.assign_coords(
                        coords={
                            f"category ({cat_terminology_in})": (
                                f"category ({cat_terminology_in})",
                                [cat_to_generate],
                            )
                        }
                    )
                    if cat_name_present:
                        cat_name = subtract_cats_current[cat_to_generate]["name"]
                        data_agg = data_agg.assign_coords(
                            coords={
                                "orig_cat_name": (
                                    f"category ({cat_terminology_in})",
                                    [cat_name],
                                )
                            }
                        )
                    data_country = data_country.pr.merge(
                        data_agg, tolerance=tolerance
                    )
                else:
                    print(f"no data to generate category {cat_to_generate}")
        # downscaling
        if "downscale" in processing_info_country:
            if "sectors" in processing_info_country["downscale"]:
                sector_downscaling = processing_info_country["downscale"]["sectors"]
                for case in sector_downscaling.keys():
                    print(f"Downscaling for {case}.")
                    sector_downscaling_current = sector_downscaling[case]
                    entities = sector_downscaling_current.pop("entities")
                    for entity in entities:
                        data_country[entity] = data_country[
                            entity
                        ].pr.downscale_timeseries(**sector_downscaling_current)
                        # , skipna_evaluation_dims=None)

            if "entities" in processing_info_country["downscale"]:
                entity_downscaling = processing_info_country["downscale"]["entities"]
                for case in entity_downscaling.keys():
                    print(f"Downscaling for {case}.")
                    # print(data_country.coords[f'category ('
                    #                           f'{cat_terminology_in})'].values)
                    data_country = data_country.pr.downscale_gas_timeseries(
                        **entity_downscaling[case],
                        skipna=True,
                        skipna_evaluation_dims=None,
                    )

        # aggregate categories
        if "aggregate_cats" in processing_info_country:
            warnings.warn(
                'The "aggregate_cats" flag is deprecated and will '
                "be removed in a future version. Please use "
                '"aggregate_coords" with key "category" instead',
                category=DeprecationWarning,
            )
            print(
                f"Aggregating categories for country {country_code}, source {source}, "
                f"scenario {scenario}"
            )
            # prep input to add_aggregates_coordinates
            agg_info = {"category": processing_info_country["aggregate_cats"]}
            if "agg_tolerance" in processing_info_country:
                agg_tolerance = processing_info_country["agg_tolerance"]
            else:
                agg_tolerance = tolerance

            data_country = data_country.pr.add_aggregates_coordinates(
                agg_info=agg_info,
                tolerance=agg_tolerance,
                skipna=True,
                min_count=1,
            )

        if "aggregate_coords" in processing_info_country:
            print(
                f"Aggregating data for country {country_code}, source {source}, "
                f"scenario {scenario}"
            )
            data_country = data_country.pr.add_aggregates_coordinates(
                agg_info=processing_info_country["aggregate_coords"],
                skipna=True,
                min_count=1,
            )

        # copy HFCs and PFCs with default factors
        if "basket_copy" in processing_info_country:
            GWPs_to_add = processing_info_country["basket_copy"]["GWPs_to_add"]
            entities = processing_info_country["basket_copy"]["entities"]
            source_GWP = processing_info_country["basket_copy"]["source_GWP"]
            for entity in entities:
                data_source = data_country[f"{entity} ({source_GWP})"]
                for GWP in GWPs_to_add:
                    data_GWP = (
                        data_source * GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
                    )
                    data_GWP.attrs["entity"] = entity
                    data_GWP.attrs["gwp_context"] = GWP
                    data_country[f"{entity} ({GWP})"] = data_GWP

        # aggregate gases if desired
        if "aggregate_gases" in processing_info_country:
            data_country = data_country.pr.add_aggregates_variables(
                gases=processing_info_country["aggregate_gases"],
            )

    # 3: map categories
    if category_conversion is not None:
        data_country = convert_categories(
            data_country,
            category_conversion,
            cat_terminology_out,
            debug=False,
            tolerance=tolerance,
        )
    else:
        cat_terminology_out = cat_terminology_in

    # more general processing
    # reduce categories to output cats
    if sectors_out is not None:
        cats_to_keep = [
            cat
            for cat in data_country.coords[
                f"category ({cat_terminology_out})"
            ].to_numpy()
            if cat in sectors_out
        ]
        data_country = data_country.pr.loc[{"category": cats_to_keep}]

    # create gas baskets
    if gas_baskets:
        data_country = data_country.pr.add_aggregates_variables(
            gas_baskets=gas_baskets, skipna=True, min_count=1, tolerance=tolerance
        )

    # amend title and comment
    data_country.attrs["comment"] = (
        data_country.attrs["comment"] + f" Processed on {date.today()}"
    )
    data_country.attrs["title"] = (
        data_country.attrs["title"] + f" Processed on {date.today()}"
    )

    return data_country
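
# The sketch below shows how process_data_for_country is typically called.
# It is a minimal, hypothetical example: the dataset `ds_raw`, the basket
# definition, the ignored entity and the terminology string are placeholders,
# not values used elsewhere in this module.
#
#     gas_baskets = {"KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O"]}
#     ds_processed = process_data_for_country(
#         ds_raw,  # single-country, single-scenario, single-source dataset
#         entities_to_ignore=["NMVOC"],
#         gas_baskets=gas_baskets,
#         cat_terminology_out="IPCC2006_PRIMAP",
#         processing_info_country={"tolerance": 0.015},
#     )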


def convert_categories(
    ds_input: xr.Dataset,
    conversion: dict[str, dict[str, str]],
    # terminology_from: str,
    terminology_to: str,
    debug: bool = False,
    tolerance: float = 0.01,
) -> xr.Dataset:
    """
    Convert data from one category terminology to another
    """
    print(f"converting categories to {terminology_to}")

    ds_converted = ds_input.copy(deep=True)
    ds_converted.attrs = deepcopy(ds_input.attrs)
    # TODO: change attrs for additional coordinates

    # change category terminology
    cat_dim = ds_converted.attrs["cat"]
    ds_converted.attrs["cat"] = f"category ({terminology_to})"
    ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})

    # find categories present in dataset
    cats_present = list(ds_converted.coords[f"category ({terminology_to})"])

    # restrict categories and map category names
    if "mapping" in conversion.keys():
        mapping_cats_present = [
            cat for cat in list(conversion["mapping"].keys()) if cat in cats_present
        ]
        ds_converted = ds_converted.pr.loc[{"category": mapping_cats_present}]

        from_cats = ds_converted.coords[f"category ({terminology_to})"].to_numpy()
        to_cats = pd.Series(from_cats).replace(conversion["mapping"])
        ds_converted = ds_converted.assign_coords(
            {f"category ({terminology_to})": (f"category ({terminology_to})", to_cats)}
        )

    # aggregate categories
    if "aggregate" in conversion:
        agg_info = {
            "category": conversion["aggregate"],
        }
        ds_converted = ds_converted.pr.add_aggregates_coordinates(
            agg_info=agg_info,
            tolerance=tolerance,
            skipna=True,
            min_count=1,
        )

    return ds_converted
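
# Hypothetical example of the `conversion` dict accepted by convert_categories
# (the category codes are illustrative, and the exact aggregation format is
# whatever primap2's add_aggregates_coordinates expects): "mapping" renames
# individual codes, "aggregate" builds new categories from the renamed ones.
#
#     conversion_example = {
#         "mapping": {"1A": "1.A", "1B": "1.B"},
#         "aggregate": {"1": {"sources": ["1.A", "1.B"]}},
#     }
#     ds_new = convert_categories(ds, conversion_example, "IPCC2006_PRIMAP")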


def get_country_name(
    country_code: str,
) -> str:
    """Get country name from code"""
    if country_code in custom_country_mapping:
        country_name = custom_country_mapping[country_code]
    else:
        try:
            country = pycountry.countries.get(alpha_3=country_code)
            country_name = country.name
        except:  # noqa: E722
            raise ValueError(  # noqa: TRY003, TRY200
                f"Country code {country_code} can not be mapped to any country"
            )

    return country_name


def get_country_code(
    country_name: str,
) -> str:
    """
    Obtain country code.

    If the input already is a three-letter code it is returned unchanged;
    otherwise a (fuzzy) search for the country name is performed.

    Parameters
    ----------
    country_name: str
        Country code or name to get the three-letter code for.

    Returns
    -------
    country_code: str
    """
    # First check if it's in the list of custom codes
    if country_name in custom_country_mapping:
        country_code = country_name
    else:
        try:
            # check if it's a 3 letter code
            country = pycountry.countries.get(alpha_3=country_name)
            country_code = country.alpha_3
        except:  # noqa: E722
            try:
                country = pycountry.countries.search_fuzzy(
                    country_name.replace("_", " ")
                )
            except:  # noqa: E722
                raise ValueError(  # noqa: TRY200, TRY003
                    f"Country name {country_name} can not be mapped to "
                    f"any country code. Try using the ISO3 code directly."
                )
            if len(country) > 1:
                country_code = None
                for current_country in country:
                    if current_country.name == country_name:
                        country_code = current_country.alpha_3
                if country_code is None:
                    raise ValueError(  # noqa: TRY200, TRY003
                        f"Country name {country_name} has {len(country)} "
                        f"possible results for country codes."
                    )
            else:
                country_code = country[0].alpha_3

    return country_code
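
# Usage sketch (hypothetical inputs): a country name and its ISO3 code both
# resolve to the same three-letter code.
#
#     get_country_code("Kenya")  # -> "KEN"
#     get_country_code("KEN")    # -> "KEN"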


def create_folder_mapping(  # noqa: PLR0912
    folder: str, extracted: bool = False
) -> None:
    """
    Create a mapping of iso codes to folder names

    Create a mapping from 3 letter ISO country codes to folders
    based on the subfolders of the given folder. The mapping is
    stored in 'folder_mapping.json' in the given folder. Folder
    must be given relative to the repository root.

    Parameters
    ----------
    folder: str
        folder to create the mapping for
    extracted: bool = False
        If true treat the folder as extracted data, where we
        only have one folder per country and no typos in the
        names

    Returns
    -------
    Nothing
    """
    folder = root_path / folder
    folder_mapping = {}
    # if not extracted:
    known_folders = custom_folders
    # else:
    #     known_folders = {}

    for item in folder.iterdir():
        if item.is_dir() and not item.match("__pycache__"):
            if item.name in known_folders:
                ISO3 = known_folders[item.name]
            else:
                try:
                    country = pycountry.countries.search_fuzzy(
                        item.name.replace("_", " ")
                    )
                    if len(country) > 1:
                        ISO3 = None
                        for current_country in country:
                            if current_country.name == item.name.replace("_", " "):
                                ISO3 = current_country.alpha_3
                    else:
                        ISO3 = country[0].alpha_3
                except:  # noqa: E722
                    ISO3 = None

            if ISO3 is None:
                print(f"No match for {item.name}")
            elif ISO3 in folder_mapping.keys():
                folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
            else:
                folder_mapping[ISO3] = item.name

    with open(folder / "folder_mapping.json", "w") as mapping_file:
        json.dump(dict(sorted(folder_mapping.items())), mapping_file, indent=4)
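
# Minimal usage sketch; the folder argument is illustrative and must exist
# relative to the repository root:
#
#     create_folder_mapping("downloaded_data/UNFCCC")
#
# This writes 'folder_mapping.json' into that folder with entries such as
# {"MYS": "Malaysia"}, i.e. ISO3 code -> subfolder name.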


# TODO add crf
def get_country_submissions(  # noqa: PLR0912
    country_name: str,
    print_sub: bool = True,
) -> dict[str, list[str]]:
    """
    Get all submissions for a country

    Input is a three letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    queries the folder mapping files for folders.

    Parameters
    ----------
    country_name: str
        String containing the country name or ISO 3 letter code
    print_sub: bool
        If True information on submissions will be written to stdout

    Returns
    -------
    returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
    Each value is a list of folders
    """
    data_folder = downloaded_data_path

    country_code = get_country_code(country_name)

    if print_sub:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    country_submissions = {}
    if print_sub:
        print("#" * 80)
        print(f"The following submissions are available for {country_name}")
    for item in data_folder.iterdir():
        if item.is_dir():
            if print_sub:
                print("")
                print("-" * 80)
                print(f"Data folder {item.name}")
                print("-" * 80)
            with open(item / "folder_mapping.json") as mapping_file:
                folder_mapping = json.load(mapping_file)
            if country_code in folder_mapping:
                country_folders = folder_mapping[country_code]
                if isinstance(country_folders, str):
                    # only one folder
                    country_folders = [country_folders]

                submission_folders = []
                for country_folder in country_folders:
                    current_folder = item / country_folder
                    if print_sub:
                        print(f"Submissions in folder {country_folder}:")

                    for submission_folder in current_folder.iterdir():
                        if submission_folder.is_dir():
                            if print_sub:
                                print(submission_folder.name)
                            submission_folders.append(submission_folder.name)

                country_submissions[item.name] = submission_folders
            else:
                print(f"No submissions available for {country_name}.")

    return country_submissions


def get_country_datasets(  # noqa: PLR0915, PLR0912
    country_name: str,
    print_ds: bool = True,
) -> dict[str, list[str]]:
    """
    Get all datasets for a country

    Input is a three letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    checks the code and data folders for content on the country.

    Parameters
    ----------
    country_name: str
        String containing the country name or ISO 3 letter code
    print_ds: bool
        If True information on submissions will be written to stdout

    Returns
    -------
    returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
    Each value is a list of folders
    """
    data_folder = extracted_data_path
    data_folder_legacy = legacy_data_path

    # obtain country code
    country_code = get_country_code(country_name)

    if print_ds:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    rep_data = {}
    # data
    if print_ds:
        print("#" * 80)
        print(f"The following datasets are available for {country_name}")
    for item in data_folder.iterdir():
        if item.is_dir():
            cleaned_datasets_current_folder = {}
            if print_ds:
                print("-" * 80)
                print(f"Data folder {item.name}")
                print("-" * 80)
            with open(item / "folder_mapping.json") as mapping_file:
                folder_mapping = json.load(mapping_file)
            if country_code not in folder_mapping:
                if print_ds:
                    print("No data available")
                    print("")
            else:
                country_folder = folder_mapping[country_code]
                if not isinstance(country_folder, str):
                    raise ValueError(  # noqa: TRY003
                        "Wrong data type in folder mapping json file. Should be str."
                    )

                datasets_current_folder = {}
                current_folder = item / country_folder

                for data_file in current_folder.iterdir():
                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                        if data_file.stem in datasets_current_folder:
                            datasets_current_folder[data_file.stem].append(
                                data_file.suffix
                            )
                        else:
                            datasets_current_folder[data_file.stem] = [
                                data_file.suffix
                            ]

                for dataset in datasets_current_folder:
                    # process filename to get submission
                    parts = dataset.split("_")
                    if parts[0] != country_code:
                        cleaned_datasets_current_folder[
                            f"Wrong code: {parts[0]}"
                        ] = dataset
                    else:
                        terminology = "_".join(parts[3:])
                        key = f"{parts[1]} ({parts[2]}, {terminology})"
                        data_info = ""
                        if ".nc" in datasets_current_folder[dataset]:
                            data_info = data_info + "NF (.nc), "
                        if (".csv" in datasets_current_folder[dataset]) and (
                            ".yaml" in datasets_current_folder[dataset]
                        ):
                            data_info = data_info + "IF (.yaml + .csv), "
                        elif ".csv" in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF? (.csv), "
                        elif ".yaml" in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF (.yaml), "

                        code_file = get_code_file(country_code, parts[1])
                        if code_file:
                            data_info = data_info + f"code: {code_file.name}"
                        else:
                            data_info = data_info + "code: not found"

                        cleaned_datasets_current_folder[key] = data_info

                if print_ds:
                    if cleaned_datasets_current_folder:
                        for country_ds in cleaned_datasets_current_folder:
                            print(
                                f"{country_ds}: "
                                f"{cleaned_datasets_current_folder[country_ds]}"
                            )
                    else:
                        print("No data available")
                    print("")

            rep_data[item.name] = cleaned_datasets_current_folder

    # legacy data
    if print_ds:
        print("#" * 80)
        print(f"The following legacy datasets are available for {country_name}")
    legacy_data = {}
    for item in data_folder_legacy.iterdir():
        if item.is_dir():
            cleaned_datasets_current_folder = {}
            if print_ds:
                print("-" * 80)
                print(f"Data folder {item.name}")
                print("-" * 80)
            with open(item / "folder_mapping.json") as mapping_file:
                folder_mapping = json.load(mapping_file)
            if country_code not in folder_mapping:
                if print_ds:
                    print("No data available")
                    print("")
            else:
                country_folder = folder_mapping[country_code]
                if not isinstance(country_folder, str):
                    raise ValueError(  # noqa: TRY003
                        "Wrong data type in folder mapping json file. Should be str."
                    )

                datasets_current_folder = {}
                current_folder = item / country_folder

                for data_file in current_folder.iterdir():
                    if data_file.suffix in [".nc", ".yaml", ".csv"]:
                        if data_file.stem in datasets_current_folder:
                            datasets_current_folder[data_file.stem].append(
                                data_file.suffix
                            )
                        else:
                            datasets_current_folder[data_file.stem] = [
                                data_file.suffix
                            ]

                for dataset in datasets_current_folder:
                    # process filename to get submission
                    parts = dataset.split("_")
                    if parts[0] != country_code:
                        cleaned_datasets_current_folder[
                            f"Wrong code: {parts[0]}"
                        ] = dataset
                    else:
                        terminology = "_".join(parts[3:])
                        key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                        data_info = ""
                        if ".nc" in datasets_current_folder[dataset]:
                            data_info = data_info + "NF (.nc), "
                        if (".csv" in datasets_current_folder[dataset]) and (
                            ".yaml" in datasets_current_folder[dataset]
                        ):
                            data_info = data_info + "IF (.yaml + .csv), "
                        elif ".csv" in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF? (.csv), "
                        elif ".yaml" in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF (.yaml), "

                        cleaned_datasets_current_folder[key] = data_info

                if print_ds:
                    if cleaned_datasets_current_folder:
                        for country_ds in cleaned_datasets_current_folder:
                            print(
                                f"{country_ds}: "
                                f"{cleaned_datasets_current_folder[country_ds]}"
                            )
                    else:
                        print("No data available")
                    print("")

            legacy_data[item.name] = cleaned_datasets_current_folder

    all_data = {
        "rep_data": rep_data,
        "legacy_data": legacy_data,
    }

    return all_data


def get_code_file(
    country_name: str,
    submission: str,
    print_info: bool = False,
) -> Path | None:
    """
    For given country name and submission find the script that creates the data

    Parameters
    ----------
    country_name: str
        String containing the country name or ISO 3 letter code
    submission: str
        String of the submission
    print_info: bool = False
        If True print information on code found

    Returns
    -------
    returns a pathlib Path object for the code file, or None if no code file is found
    """
    code_file_path = None
    UNFCCC_reader_path = code_path / "unfccc_reader"

    # CRF is an exception as it's read using the unfccc_crf_reader module
    # so we return the path to that.
    if submission[0:3] in ("CRF", "CRT"):
        return root_path / "unfccc_crf_reader"

    if submission[0:2] == "DI":
        return root_path / "unfccc_di_reader"

    # replace "-" by "_" in submission
    submission = submission.replace("-", "_")

    # obtain country code
    country_code = get_country_code(country_name)

    if print_info:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    with open(UNFCCC_reader_path / "folder_mapping.json") as mapping_file:
        folder_mapping = json.load(mapping_file)

    if country_code not in folder_mapping:
        if print_info:
            print("No code available")
            print("")
    else:
        country_folder = UNFCCC_reader_path / folder_mapping[country_code]
        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"

        for file in country_folder.iterdir():
            if file.match(code_file_name_candidate):
                if code_file_path is not None:
                    raise ValueError(  # noqa: TRY003
                        f"Found multiple code candidates: "
                        f"{code_file_path} and {file.name}. "
                        f"Please use only one file with name "
                        f"'read_ISO3_submission_XXX.YYY'."
                    )
                elif print_info:
                    print(f"Found code file {file.relative_to(root_path)}")
                code_file_path = file

    if code_file_path is not None:
        return code_file_path.relative_to(root_path)
    else:
        return None
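
# Usage sketch (hypothetical arguments): look up the reading script for a
# country/submission combination.
#
#     get_code_file("Malaysia", "BUR3")
#     # -> a path such as unfccc_reader/Malaysia/read_MYS_BUR3_XXX.py relative
#     #    to the repository root (filename illustrative), or None if no
#     #    matching script is found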


def fix_rows(
    data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int
) -> pd.DataFrame:
    """
    Fix rows that have been split during reading from pdf

    This is the version used for Malaysia BUR3 and BUR4. Adapt for other BURs
    if needed.

    Parameters
    ----------
    data
        Input DataFrame
    rows_to_fix
        Values of `col_to_use` that identify the rows which need to be merged
    col_to_use
        Column used to find the rows to fix
    n_rows
        Number of rows to merge. For the negative values handled below, the
        merge starts one row above the matched row.

    Returns
    -------
    Dataframe with fixed rows
    """
    for row in rows_to_fix:
        # print(row)
        # find the row number and collect the row and the next two rows
        index = data.loc[data[col_to_use] == row].index
        # print(list(index))
        if not list(index):
            print(f"Can't merge split row {row}")
            print(data[col_to_use])
            continue
        # print(f"Merging split row {row} for table {page}")
        loc = data.index.get_loc(index[0])
        # TODO: formula for negative values
        if n_rows == -2:  # noqa: PLR2004
            locs_to_merge = list(range(loc - 1, loc + 1))
        elif n_rows == -3:  # noqa: PLR2004
            locs_to_merge = list(range(loc - 1, loc + 2))
        elif n_rows == -5:  # noqa: PLR2004
            locs_to_merge = list(range(loc - 1, loc + 4))
        else:
            locs_to_merge = list(range(loc, loc + n_rows))
        rows_to_merge = data.iloc[locs_to_merge]
        indices_to_merge = rows_to_merge.index
        # join the rows
        new_row = rows_to_merge.agg(" ".join)
        # replace the double spaces that are created
        # must be done here and not at the end as splits are not always
        # the same and join would produce different col values
        new_row = new_row.str.replace("  ", " ")
        new_row = new_row.str.replace("N O", "NO")
        new_row = new_row.str.replace(", N", ",N")
        new_row = new_row.str.replace("- ", "-")
        new_row = new_row.str.strip()

        # replace spaces in numbers
        pat = r"^(?P<first>[0-9\.,]*)\s(?P<last>[0-9\.,]*)$"

        def repl(m):
            return f"{m.group('first')}{m.group('last')}"

        new_row = new_row.str.replace(pat, repl, regex=True)

        data.loc[indices_to_merge[0]] = new_row
        data = data.drop(indices_to_merge[1:])

    return data
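
# Toy example (not from a real submission) of what fix_rows repairs: the label
# "Total national emissions" was split over two rows by the PDF reader and is
# merged back into a single row, with the split number "1 234" re-joined.
#
#     df = pd.DataFrame(
#         {"cat": ["Total national", "emissions", "1A"], "2019": ["1 234", "", "56"]}
#     )
#     fixed = fix_rows(df, rows_to_fix=["Total national"], col_to_use="cat", n_rows=2)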


def make_wide_table(
    data: pd.DataFrame,
    keyword: str,
    col: int | str,
    index_cols: list[int | str],
) -> pd.DataFrame:
    """
    Make a wide table from a table which is a stack of tables for different time periods

    Parameters
    ----------
    data
        Input table as pandas.DataFrame
    keyword
        Keyword that marks the header row of each sub-table in column `col`
    col
        Column in which to search for the keyword
    index_cols
        Columns to use as index in the resulting wide table (referring to the
        header values of each sub-table)

    Returns
    -------
    pandas.DataFrame in wide format
    """
    index = data.loc[data[col] == keyword].index
    if not list(index):
        print("Keyword for table transformation not found")
        return data
    elif len(index) == 1:
        print("Keyword for table transformation found only once")
        return data
    else:
        df_all = None
        for i, item in enumerate(index):
            loc = data.index.get_loc(item)
            if i < len(index) - 1:
                next_loc = data.index.get_loc(index[i + 1])
            else:
                next_loc = data.index[-1] + 1
            df_to_add = data.loc[list(range(loc, next_loc))]
            # select only cols which don't have NaN, Null, or '' as header
            filter_nan = (
                (~df_to_add.iloc[0].isna())
                & (df_to_add.iloc[0] != "NaN")
                & (df_to_add.iloc[0])
            )
            df_to_add = df_to_add.loc[:, filter_nan]
            df_to_add.columns = df_to_add.iloc[0]
            # print(df_to_add.columns)
            df_to_add = df_to_add.drop(loc)
            df_to_add = df_to_add.set_index(index_cols)

            if df_all is None:
                df_all = df_to_add
            else:
                df_all = pd.concat([df_all, df_to_add], axis=1, join="outer")
        return df_all
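
# Illustration with toy values: the header row, identified by `keyword` in
# column `col`, repeats once per time block, and the blocks are joined side by
# side on `index_cols` (which refers to the header values, not the original
# column labels).
#
#     raw = pd.DataFrame(
#         [
#             ["Categories", "2015", "2016"],
#             ["1A", "1.0", "2.0"],
#             ["Categories", "2017", "2018"],
#             ["1A", "3.0", "4.0"],
#         ]
#     )
#     wide = make_wide_table(raw, keyword="Categories", col=0, index_cols=["Categories"])
#     # -> one row "1A" with columns 2015, 2016, 2017 and 2018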


def find_and_replace_values(
    df: pd.DataFrame,
    replace_info: list[tuple[str | float]],
    category_column: str,
    entity_column: str = "entity",
) -> pd.DataFrame:
    """
    Find values and replace single values in a dataframe.

    Parameters
    ----------
    df
        Input data frame
    replace_info
        Category, entity, year, and new value. Don't put a new value if you
        would like to replace with nan.
        For example [("3.C", "CO", "2019", 3.423)] or [("3.C", "CO", "2019")]
    category_column
        The name of the column that contains the categories.
    entity_column
        The name of the column that contains the entities.

    Output
    ------
    Data frame with updated values.
    """
    for replace_info_value in replace_info:
        category = replace_info_value[0]
        entity = replace_info_value[1]
        year = replace_info_value[2]

        if len(replace_info_value) == 4:  # noqa: PLR2004
            new_value = replace_info_value[3]
        elif len(replace_info_value) == 3:  # noqa: PLR2004
            new_value = np.nan
        else:
            raise AssertionError(  # noqa: TRY003
                f"Expected tuple of length 3 or 4. Got {replace_info_value}"
            )

        index = df.loc[
            (df[category_column] == category) & (df[entity_column] == entity),
        ].index[0]

        # pandas recommends using .at[] for changing single values
        df.loc[index, year] = new_value

        print(f"Set value for {category}, {entity}, {year} to {new_value}.")

    return df
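
# Usage sketch based on the docstring example (toy values): overwrite one data
# point and set another to NaN; the column name is illustrative.
#
#     df = find_and_replace_values(
#         df,
#         replace_info=[
#             ("3.C", "CO", "2019", 3.423),  # set a new value
#             ("3.D", "CH4", "2019"),        # no new value given -> NaN
#         ],
#         category_column="category (IPCC2006_PRIMAP)",
#     )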


def set_to_nan_in_ds(
    ds_in: xr.Dataset,
    entities: list[Hashable],
    filter: dict[str, any],
) -> xr.Dataset:
    """
    Set values to NaN in a data set.

    Parameters
    ----------
    ds_in:
        input dataset
    entities
        list of entities to work on
    filter
        .pr.loc type selector which selects the elements that should be replaced
        with nan

    Returns
    -------
    xr.Dataset with the desired values set to nan
    """
    ds_mask = xr.zeros_like(ds_in[entities].pr.loc[filter]).combine_first(
        xr.ones_like(ds_in)
    )
    return ds_in.where(ds_mask)
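
# Usage sketch (hypothetical selector): blank out CH4 for one category and two
# years, e.g. because the reported values are known to be wrong. The selector
# format is whatever ds.pr.loc accepts.
#
#     ds_clean = set_to_nan_in_ds(
#         ds,
#         entities=["CH4"],
#         filter={"category": ["1.A.2"], "time": ["2015", "2016"]},
#     )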


def assert_values(
    df: pd.DataFrame,
    test_case: tuple[str | float | int],
    category_column: str = "category (IPCC1996_2006_GIN_Inv)",
    entity_column: str = "entity",
) -> None:
    """
    Check if a value in a dataframe matches the expected value.

    Parameters
    ----------
    df
        The data frame to check.
    test_case
        The combination of parameters and the expected value.
        Use the format (<category>, <entity>, <year>, <expected_value>).
    category_column
        The column where to look for the category.
    entity_column
        The column where to look for the entity.
    """
    category = test_case[0]
    entity = test_case[1]
    year = test_case[2]
    expected_value = test_case[3]

    assert isinstance(  # noqa: S101
        expected_value, (float, int)
    ), (
        "This function only works for numbers. "
        "Use assert_nan_values to check for NaNs "
        "and empty values."
    )

    arr = df.loc[
        (df[category_column] == category) & (df[entity_column] == entity), year
    ].to_numpy()

    # Assert the category exists in the data frame
    assert (  # noqa: S101
        category in df[category_column].unique()
    ), f"{category} is not a valid category. Choose from {df[category_column].unique()}"

    # Assert the entity exists in the data frame
    assert (  # noqa: S101
        entity in df[entity_column].unique()
    ), f"{entity} is not a valid entity. Choose from {df[entity_column].unique()}"

    assert (  # noqa: S101
        arr.size > 0
    ), f"No value found for category {category}, entity {entity}, year {year}!"

    assert (  # noqa: S101
        arr.size <= 1
    ), (
        f"More than one value found for category {category}, entity {entity}, "
        f"year {year}!"
    )

    assert (  # noqa: S101
        arr[0] == expected_value
    ), f"Expected value {expected_value}, actual value is {arr[0]}"

    print(
        f"Value for category {category}, entity {entity}, year {year} is as expected."
    )
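
# Usage sketch (toy expected value, illustrative column name): check a single
# data point after reading a submission.
#
#     assert_values(
#         df,
#         ("1.A.1", "CO2", "2019", 12345.6),
#         category_column="category (IPCC2006_PRIMAP)",
#     )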


def assert_nan_values(
    df: pd.DataFrame,
    test_case: tuple[str, ...],
    category_column: str = "category (IPCC1996_2006_GIN_Inv)",
    entity_column: str = "entity",
) -> None:
    """
    Check for empty, NE, NE1 values

    Check if values that are empty or NE or NE1 in the PDF tables
    are not present in the dataset.

    Parameters
    ----------
    df
        The data frame to check.
    test_case
        The combination of input parameters.
        Use the format (<category>, <entity>, <year>).
    category_column
        The column where to look for the category.
    entity_column
        The column where to look for the entity.
    """
    category = test_case[0]
    entity = test_case[1]
    year = test_case[2]

    if category not in df[category_column].unique():
        warning_string = (
            f"{category} is not in the data set. Either all values "
            f"for this category are NaN or the category never "
            f"existed in the data set."
        )
        warnings.warn(warning_string)
        return

    if entity not in df[entity_column].unique():
        warning_string = (
            f"{entity} is not in the data set. Either all values "
            f"for this entity are NaN or the entity never "
            f"existed in the data set."
        )
        warnings.warn(warning_string)
        return

    arr = df.loc[
        (df[category_column] == category) & (df[entity_column] == entity), year
    ].to_numpy()

    assert np.isnan(arr[0]), f"Value is {arr[0]} and not NaN."  # noqa: S101

    print(f"Value for category {category}, entity {entity}, year {year} is NaN.")