functions.py

import pycountry
import json
import re
import xarray as xr
import pandas as pd
import numpy as np
from datetime import date
from copy import deepcopy
from typing import Dict, List, Optional
from pathlib import Path

from .definitions import custom_country_mapping, custom_folders
from .definitions import root_path, downloaded_data_path, extracted_data_path
from .definitions import legacy_data_path, code_path


def process_data_for_country(
    data_country: xr.Dataset,
    entities_to_ignore: List[str],
    gas_baskets: Dict[str, List[str]],
    filter_dims: Optional[Dict[str, List[str]]] = None,
    cat_terminology_out: Optional[str] = None,
    category_conversion: Optional[Dict[str, Dict]] = None,
    sectors_out: Optional[List[str]] = None,
    processing_info_country: Optional[Dict] = None,
) -> xr.Dataset:
    """
    Process data from the DI interface (where necessary).

    * Downscaling including subtraction of time series
    * country specific sector aggregation
    * Conversion to IPCC2006 categories
    * general sector and gas basket aggregation (in new categories)
    """
    # 0: gather information
    countries = list(data_country.coords[data_country.attrs['area']].values)
    if len(countries) > 1:
        raise ValueError(
            f"Found {len(countries)} countries. Only single country data "
            f"can be processed by this function. Countries: {countries}")
    else:
        country_code = countries[0]

    # get category terminology
    cat_col = data_country.attrs['cat']
    temp = re.findall(r'\((.*)\)', cat_col)
    cat_terminology_in = temp[0]

    # get scenario
    scenarios = list(data_country.coords[data_country.attrs['scen']].values)
    if len(scenarios) > 1:
        raise ValueError(
            f"Found {len(scenarios)} scenarios. Only single scenario data "
            f"can be processed by this function. Scenarios: {scenarios}")
    scenario = scenarios[0]

    # get source
    sources = list(data_country.coords['source'].values)
    if len(sources) > 1:
        raise ValueError(
            f"Found {len(sources)} sources. Only single source data "
            f"can be processed by this function. Sources: {sources}")
    source = sources[0]

    # check if the category name column is present
    # TODO: replace 'name' in config by 'additional_cols' dict that defines the cols
    #  and the values
    cat_name_present = 'orig_cat_name' in data_country.coords

    # 1: general processing
    # remove unused categories
    data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
    # remove unused years
    data_country = data_country.dropna('time', how='all')
    # remove variables only containing nan
    nan_vars_country = [var for var in data_country.data_vars if
                        data_country[var].isnull().all().data is True]
    print(f"removing all-nan variables: {nan_vars_country}")
    data_country = data_country.drop_vars(nan_vars_country)
    # remove unnecessary variables
    entities_ignore_present = [entity for entity in entities_to_ignore if
                               entity in data_country.data_vars]
    data_country = data_country.drop_vars(entities_ignore_present)
    # filter on the given dimensions
    if filter_dims is not None:
        data_country = data_country.pr.loc[filter_dims]

    # 2: country specific processing
    if processing_info_country is not None:
        if 'tolerance' in processing_info_country:
            tolerance = processing_info_country["tolerance"]
        else:
            tolerance = 0.01

        # remove entities if needed
        if 'ignore_entities' in processing_info_country:
            entities_to_ignore_country = processing_info_country[
                'ignore_entities']
            entities_ignore_present = \
                [entity for entity in entities_to_ignore_country if
                 entity in data_country.data_vars]
            data_country = data_country.drop_vars(entities_ignore_present)

        # take only desired years
        if 'years' in processing_info_country:
            data_country = data_country.pr.loc[
                {'time': processing_info_country['years']}]

        # remove timeseries if desired
        if 'remove_ts' in processing_info_country:
            for case in processing_info_country['remove_ts']:
                remove_info = processing_info_country['remove_ts'][case]
                entities = remove_info.pop("entities")
                for entity in entities:
                    data_country[entity].pr.loc[remove_info] = \
                        data_country[entity].pr.loc[remove_info] * np.nan

        # remove all data for given years if necessary
        if 'remove_years' in processing_info_country:
            data_country = data_country.drop_sel(
                time=processing_info_country['remove_years'])

        # subtract categories
        if 'subtract_cats' in processing_info_country:
            subtract_cats_current = processing_info_country['subtract_cats']
            if 'entities' in subtract_cats_current:
                entities_current = subtract_cats_current['entities']
            else:
                entities_current = list(data_country.data_vars)
            print(f"Subtracting categories for country {country_code}, entities "
                  f"{entities_current}")
            for cat_to_generate in subtract_cats_current:
                # the optional 'entities' key is configuration, not a category
                if cat_to_generate == 'entities':
                    continue
                cats_to_subtract = \
                    subtract_cats_current[cat_to_generate]['subtract']
                data_sub = \
                    data_country.pr.loc[{'category': cats_to_subtract}].pr.sum(
                        dim='category', skipna=True, min_count=1)
                data_parent = data_country.pr.loc[
                    {'category': subtract_cats_current[cat_to_generate]['parent']}]
                data_agg = data_parent - data_sub
                nan_vars = [var for var in data_agg.data_vars if
                            data_agg[var].isnull().all().data is True]
                data_agg = data_agg.drop_vars(nan_vars)
                if len(data_agg.data_vars) > 0:
                    print(f"Generating {cat_to_generate} through subtraction")
                    data_agg = data_agg.expand_dims(
                        [f'category ({cat_terminology_in})'])
                    data_agg = data_agg.assign_coords(
                        coords={f'category ({cat_terminology_in})':
                                    (f'category ({cat_terminology_in})',
                                     [cat_to_generate])})
                    if cat_name_present:
                        cat_name = subtract_cats_current[cat_to_generate]['name']
                        data_agg = data_agg.assign_coords(
                            coords={'orig_cat_name':
                                        (f'category ({cat_terminology_in})',
                                         [cat_name])})
                    data_country = data_country.pr.merge(data_agg,
                                                         tolerance=tolerance)
                else:
                    print(f"no data to generate category {cat_to_generate}")

        # downscaling
        if 'downscale' in processing_info_country:
            if 'sectors' in processing_info_country['downscale']:
                sector_downscaling = \
                    processing_info_country['downscale']['sectors']
                for case in sector_downscaling.keys():
                    print(f"Downscaling for {case}.")
                    sector_downscaling_current = sector_downscaling[case]
                    entities = sector_downscaling_current.pop('entities')
                    for entity in entities:
                        data_country[entity] = data_country[
                            entity].pr.downscale_timeseries(
                            **sector_downscaling_current)

            if 'entities' in processing_info_country['downscale']:
                entity_downscaling = \
                    processing_info_country['downscale']['entities']
                for case in entity_downscaling.keys():
                    print(f"Downscaling for {case}.")
                    data_country = data_country.pr.downscale_gas_timeseries(
                        **entity_downscaling[case], skipna=True,
                        skipna_evaluation_dims=None)

        # aggregate categories
        if 'aggregate_cats' in processing_info_country:
            if 'agg_tolerance' in processing_info_country:
                agg_tolerance = processing_info_country['agg_tolerance']
            else:
                agg_tolerance = tolerance
            aggregate_cats_current = processing_info_country['aggregate_cats']
            print(
                f"Aggregating categories for country {country_code}, source {source}, "
                f"scenario {scenario}")
            for cat_to_agg in aggregate_cats_current:
                print(f"Category: {cat_to_agg}")
                source_cats = aggregate_cats_current[cat_to_agg]['sources']
                data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
                    dim='category', skipna=True, min_count=1)
                nan_vars = [var for var in data_agg.data_vars if
                            data_agg[var].isnull().all().data is True]
                data_agg = data_agg.drop_vars(nan_vars)
                if len(data_agg.data_vars) > 0:
                    data_agg = data_agg.expand_dims(
                        [f'category ({cat_terminology_in})'])
                    data_agg = data_agg.assign_coords(
                        coords={f'category ({cat_terminology_in})':
                                    (f'category ({cat_terminology_in})',
                                     [cat_to_agg])})
                    if cat_name_present:
                        cat_name = aggregate_cats_current[cat_to_agg]['name']
                        data_agg = data_agg.assign_coords(
                            coords={'orig_cat_name':
                                        (f'category ({cat_terminology_in})',
                                         [cat_name])})
                    data_country = data_country.pr.merge(data_agg,
                                                         tolerance=agg_tolerance)
                else:
                    print(f"no data to aggregate category {cat_to_agg}")

        # aggregate gases if desired
        if 'aggregate_gases' in processing_info_country:
            # TODO: why use different code here than below? Can this fill
            #  non-existent gas baskets?
            for case in processing_info_country['aggregate_gases'].keys():
                case_info = processing_info_country['aggregate_gases'][case]
                data_country[case_info['basket']] = \
                    data_country.pr.fill_na_gas_basket_from_contents(
                        **case_info)

    # 3: map categories
    if category_conversion is not None:
        data_country = convert_categories(
            data_country,
            category_conversion,
            cat_terminology_out,
            debug=False,
            tolerance=0.01,
        )
    else:
        cat_terminology_out = cat_terminology_in

    # more general processing
    # reduce categories to output categories
    if sectors_out is not None:
        cats_to_keep = [cat for cat in
                        data_country.coords[f'category ({cat_terminology_out})'].values
                        if cat in sectors_out]
        data_country = data_country.pr.loc[{'category': cats_to_keep}]

    # create gas baskets
    entities_present = set(data_country.data_vars)
    for basket in gas_baskets.keys():
        basket_contents_present = [gas for gas in gas_baskets[basket] if
                                   gas in entities_present]
        if len(basket_contents_present) > 0:
            if basket in list(data_country.data_vars):
                data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
                    basket=basket, basket_contents=basket_contents_present,
                    skipna=True, min_count=1)
            else:
                try:
                    data_country[basket] = xr.full_like(
                        data_country["CO2"], np.nan).pr.quantify(
                        units="Gg CO2 / year")
                    data_country[basket].attrs = {
                        "entity": basket.split(' ')[0],
                        "gwp_context": basket.split(' ')[1][1:-1],
                    }
                    data_country[basket] = data_country.pr.gas_basket_contents_sum(
                        basket=basket, basket_contents=basket_contents_present,
                        min_count=1)
                    entities_present.add(basket)
                except Exception as ex:
                    print(f"No gas basket created for {country_code}, {source}, "
                          f"{scenario}: {ex}")

    # amend title and comment
    data_country.attrs["comment"] = data_country.attrs["comment"] + \
        f" Processed on {date.today()}"
    data_country.attrs["title"] = data_country.attrs["title"] + \
        f" Processed on {date.today()}"

    return data_country
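
# A minimal usage sketch for process_data_for_country (kept in comments so it
# is not executed on import). The gas basket and the input dataset are
# illustrative assumptions; in practice the dataset comes from the DI reader
# and the baskets and conversion from the country configuration.
#
#     gas_baskets = {
#         "KYOTOGHG (SARGWP100)": ["CO2", "CH4", "N2O", "SF6"],
#     }
#     data_processed = process_data_for_country(
#         data_country=data_country,   # single-country, single-scenario dataset
#         entities_to_ignore=["NMVOC"],
#         gas_baskets=gas_baskets,
#         cat_terminology_out="IPCC2006_PRIMAP",
#         category_conversion=category_conversion,  # see convert_categories below
#     )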


def convert_categories(
    ds_input: xr.Dataset,
    conversion: Dict[str, Dict[str, str]],
    terminology_to: str,
    debug: bool = False,
    tolerance: float = 0.01,
) -> xr.Dataset:
    """
    Convert data from one category terminology to another.
    """
    print(f"converting categories to {terminology_to}")

    cat_name_present = 'orig_cat_name' in ds_input.coords

    ds_converted = ds_input.copy(deep=True)
    ds_converted.attrs = deepcopy(ds_input.attrs)
    # TODO: change attrs for additional coordinates

    # change category terminology
    cat_dim = ds_converted.attrs["cat"]
    ds_converted.attrs["cat"] = f"category ({terminology_to})"
    ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})

    # find categories present in the dataset
    cats_present = list(ds_converted.coords[f'category ({terminology_to})'])

    # restrict categories and map category names
    if 'mapping' in conversion.keys():
        mapping_cats_present = [cat for cat in list(conversion['mapping'].keys()) if
                                cat in cats_present]
        ds_converted = ds_converted.pr.loc[
            {'category': mapping_cats_present}]

        from_cats = ds_converted.coords[f'category ({terminology_to})'].values
        to_cats = pd.Series(from_cats).replace(conversion['mapping'])
        ds_converted = ds_converted.assign_coords(
            {f'category ({terminology_to})':
                 (f'category ({terminology_to})', to_cats)})

    # redo the list of present categories after mapping, as we have new
    # categories in the target terminology now
    cats_present_mapped = list(ds_converted.coords[f'category ({terminology_to})'])

    # aggregate categories
    if 'aggregate' in conversion:
        aggregate_cats = conversion['aggregate']
        for cat_to_agg in aggregate_cats:
            if debug:
                print(f"Category: {cat_to_agg}")
            source_cats = [cat for cat in aggregate_cats[cat_to_agg]['sources'] if
                           cat in cats_present_mapped]
            if debug:
                print(source_cats)
            data_agg = ds_converted.pr.loc[{'category': source_cats}].pr.sum(
                dim='category', skipna=True, min_count=1)
            nan_vars = [var for var in data_agg.data_vars if
                        data_agg[var].isnull().all().data is True]
            data_agg = data_agg.drop_vars(nan_vars)
            if len(data_agg.data_vars) > 0:
                data_agg = data_agg.expand_dims([f'category ({terminology_to})'])
                data_agg = data_agg.assign_coords(
                    coords={f'category ({terminology_to})':
                                (f'category ({terminology_to})', [cat_to_agg])})
                if cat_name_present:
                    data_agg = data_agg.assign_coords(
                        coords={'orig_cat_name':
                                    (f'category ({terminology_to})',
                                     [aggregate_cats[cat_to_agg]['name']])})
                ds_converted = ds_converted.pr.merge(data_agg, tolerance=tolerance)
                cats_present_mapped.append(cat_to_agg)
            else:
                print(f"no data to aggregate category {cat_to_agg}")

    return ds_converted
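
# Sketch of the 'conversion' structure this function expects, inferred from
# the code above (the category codes are illustrative, not from an actual
# config): 'mapping' renames source categories to the target terminology,
# 'aggregate' builds new categories as sums of already-mapped ones.
#
#     conversion = {
#         'mapping': {'1': '1', '2': '2', '4': 'M.AG', '5': 'M.LULUCF'},
#         'aggregate': {
#             '3': {'sources': ['M.AG', 'M.LULUCF'], 'name': 'AFOLU'},
#         },
#     }
#     ds_ipcc2006 = convert_categories(ds_di, conversion, 'IPCC2006_PRIMAP')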


def get_country_name(
    country_code: str,
) -> str:
    """Get the country name for a given country code."""
    if country_code in custom_country_mapping:
        country_name = custom_country_mapping[country_code]
    else:
        try:
            country = pycountry.countries.get(alpha_3=country_code)
            country_name = country.name
        except AttributeError:
            raise ValueError(f"Country code {country_code} cannot be mapped to "
                             f"any country")
    return country_name
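
# Example (assuming "DEU" is not overridden in custom_country_mapping):
#     get_country_name("DEU")  # -> "Germany"
# An unknown code raises a ValueError instead of returning None.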


def get_country_code(
    country_name: str,
) -> str:
    """
    Obtain a country code. If the input already is a three-letter code it is
    returned unchanged; otherwise a search for the country name is performed.

    Parameters
    ----------
    country_name: str
        Country code or name to get the three-letter code for.

    Returns
    -------
    country_code: str
    """
    # First check if it's in the list of custom codes
    if country_name in custom_country_mapping:
        country_code = country_name
    else:
        try:
            # check if it's a three-letter code
            country = pycountry.countries.get(alpha_3=country_name)
            country_code = country.alpha_3
        except AttributeError:
            try:
                country = pycountry.countries.search_fuzzy(
                    country_name.replace("_", " "))
            except LookupError:
                raise ValueError(f"Country name {country_name} cannot be mapped to "
                                 f"any country code. Try using the ISO3 code "
                                 f"directly.")
            if len(country) > 1:
                country_code = None
                for current_country in country:
                    if current_country.name == country_name:
                        country_code = current_country.alpha_3
                if country_code is None:
                    raise ValueError(f"Country name {country_name} has {len(country)} "
                                     f"possible results for country codes.")
            else:
                country_code = country[0].alpha_3
    return country_code
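
# Examples: both a country name and a three-letter code resolve to the code:
#     get_country_code("Germany")  # -> "DEU" (via pycountry fuzzy search)
#     get_country_code("DEU")      # -> "DEU" (returned unchanged)
# A name that cannot be matched raises a ValueError.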


def create_folder_mapping(
    folder: str,
    extracted: bool = False,
) -> None:
    """
    Create a mapping from three-letter ISO country codes to folders
    based on the subfolders of the given folder. The mapping is
    stored in 'folder_mapping.json' in the given folder. The folder
    must be given relative to the repository root.

    Parameters
    ----------
    folder: str
        folder to create the mapping for
    extracted: bool = False
        If true treat the folder as extracted data, where we
        only have one folder per country and no typos in the
        names

    Returns
    -------
    Nothing
    """
    folder = root_path / folder
    folder_mapping = {}
    # if not extracted:
    known_folders = custom_folders
    # else:
    #     known_folders = {}

    for item in folder.iterdir():
        if item.is_dir() and not item.match("__pycache__"):
            if item.name in known_folders:
                ISO3 = known_folders[item.name]
            else:
                try:
                    country = pycountry.countries.search_fuzzy(
                        item.name.replace("_", " "))
                    if len(country) > 1:
                        ISO3 = None
                        for current_country in country:
                            if current_country.name == item.name.replace("_", " "):
                                ISO3 = current_country.alpha_3
                    else:
                        ISO3 = country[0].alpha_3
                except LookupError:
                    ISO3 = None

            if ISO3 is None:
                print(f"No match for {item.name}")
            else:
                if ISO3 in folder_mapping:
                    folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
                else:
                    folder_mapping[ISO3] = item.name

    with open(folder / "folder_mapping.json", "w") as mapping_file:
        json.dump(folder_mapping, mapping_file, indent=4)
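
# Usage sketch: regenerate the mapping for one of the data directories.
# The path below is an example; it must be given relative to the repository
# root as described in the docstring.
#     create_folder_mapping("downloaded_data/UNFCCC")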


# TODO: add CRF
def get_country_submissions(
    country_name: str,
    print_sub: bool = True,
) -> Dict[str, List[str]]:
    """
    Input is a three-letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    queries the folder mapping files for folders.

    Parameters
    ----------
    country_name: str
        String containing the country name or ISO three-letter code
    print_sub: bool
        If True information on submissions will be written to stdout

    Returns
    -------
    returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC).
    Each value is a list of folders.
    """
    data_folder = downloaded_data_path

    country_code = get_country_code(country_name)

    if print_sub:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    country_submissions = {}
    if print_sub:
        print("#" * 80)
        print(f"The following submissions are available for {country_name}")
    for item in data_folder.iterdir():
        if item.is_dir():
            if print_sub:
                print("")
                print("-" * 80)
                print(f"Data folder {item.name}")
                print("-" * 80)
            with open(item / "folder_mapping.json", "r") as mapping_file:
                folder_mapping = json.load(mapping_file)
            if country_code in folder_mapping:
                country_folders = folder_mapping[country_code]
                if isinstance(country_folders, str):
                    # only one folder
                    country_folders = [country_folders]
                submission_folders = []
                for country_folder in country_folders:
                    current_folder = item / country_folder
                    if print_sub:
                        print(f"Submissions in folder {country_folder}:")
                    for submission_folder in current_folder.iterdir():
                        if submission_folder.is_dir():
                            if print_sub:
                                print(submission_folder.name)
                            submission_folders.append(submission_folder.name)
                country_submissions[item.name] = submission_folders
            else:
                print(f"No submissions available for {country_name}.")

    return country_submissions
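
# Usage sketch (the country is an arbitrary example):
#     submissions = get_country_submissions("Kenya", print_sub=False)
#     # e.g. {'UNFCCC': ['BUR1', 'NC2', ...], 'non-UNFCCC': [...]}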


def get_country_datasets(
    country_name: str,
    print_ds: bool = True,
) -> Dict[str, List[str]]:
    """
    Input is a three-letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    checks the code and data folders for content on the country.

    Parameters
    ----------
    country_name: str
        String containing the country name or ISO three-letter code
    print_ds: bool
        If True information on datasets will be written to stdout

    Returns
    -------
    returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC).
    Each value is a list of folders.
    """
    data_folder = extracted_data_path
    data_folder_legacy = legacy_data_path

    # obtain country code
    country_code = get_country_code(country_name)

    if print_ds:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    rep_data = {}
    # data
    if print_ds:
        print("#" * 80)
        print(f"The following datasets are available for {country_name}")
    for item in data_folder.iterdir():
        if item.is_dir():
            cleaned_datasets_current_folder = {}
            if print_ds:
                print("-" * 80)
                print(f"Data folder {item.name}")
                print("-" * 80)
            with open(item / "folder_mapping.json", "r") as mapping_file:
                folder_mapping = json.load(mapping_file)
            if country_code not in folder_mapping:
                if print_ds:
                    print("No data available")
                    print("")
            else:
                country_folder = folder_mapping[country_code]
                if not isinstance(country_folder, str):
                    raise ValueError(
                        "Wrong data type in folder mapping json file. Should be str.")

                datasets_current_folder = {}
                current_folder = item / country_folder

                for data_file in current_folder.iterdir():
                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
                        if data_file.stem in datasets_current_folder:
                            datasets_current_folder[data_file.stem].append(
                                data_file.suffix)
                        else:
                            datasets_current_folder[data_file.stem] = [
                                data_file.suffix]

                for dataset in datasets_current_folder:
                    # process filename to get submission
                    parts = dataset.split('_')
                    if parts[0] != country_code:
                        cleaned_datasets_current_folder[
                            f'Wrong code: {parts[0]}'] = dataset
                    else:
                        terminology = "_".join(parts[3:])
                        key = f"{parts[1]} ({parts[2]}, {terminology})"
                        data_info = ""
                        if '.nc' in datasets_current_folder[dataset]:
                            data_info = data_info + "NF (.nc), "
                        if ('.csv' in datasets_current_folder[dataset]) and (
                                '.yaml' in datasets_current_folder[dataset]):
                            data_info = data_info + "IF (.yaml + .csv), "
                        elif '.csv' in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF? (.csv), "
                        elif '.yaml' in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF (.yaml), "

                        code_file = get_code_file(country_code, parts[1])
                        if code_file:
                            data_info = data_info + f"code: {code_file.name}"
                        else:
                            data_info = data_info + "code: not found"

                        cleaned_datasets_current_folder[key] = data_info

                if print_ds:
                    if cleaned_datasets_current_folder:
                        for country_ds in cleaned_datasets_current_folder:
                            print(f"{country_ds}: "
                                  f"{cleaned_datasets_current_folder[country_ds]}")
                    else:
                        print("No data available")
                    print("")

            rep_data[item.name] = cleaned_datasets_current_folder

    # legacy data
    if print_ds:
        print("#" * 80)
        print(f"The following legacy datasets are available for {country_name}")
    legacy_data = {}
    for item in data_folder_legacy.iterdir():
        if item.is_dir():
            cleaned_datasets_current_folder = {}
            if print_ds:
                print("-" * 80)
                print(f"Data folder {item.name}")
                print("-" * 80)
            with open(item / "folder_mapping.json", "r") as mapping_file:
                folder_mapping = json.load(mapping_file)
            if country_code not in folder_mapping:
                if print_ds:
                    print("No data available")
                    print("")
            else:
                country_folder = folder_mapping[country_code]
                if not isinstance(country_folder, str):
                    raise ValueError(
                        "Wrong data type in folder mapping json file. Should be str.")

                datasets_current_folder = {}
                current_folder = item / country_folder

                for data_file in current_folder.iterdir():
                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
                        if data_file.stem in datasets_current_folder:
                            datasets_current_folder[data_file.stem].append(
                                data_file.suffix)
                        else:
                            datasets_current_folder[data_file.stem] = [
                                data_file.suffix]

                for dataset in datasets_current_folder:
                    # process filename to get submission
                    parts = dataset.split('_')
                    if parts[0] != country_code:
                        cleaned_datasets_current_folder[
                            f'Wrong code: {parts[0]}'] = dataset
                    else:
                        terminology = "_".join(parts[3:])
                        key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                        data_info = ""
                        if '.nc' in datasets_current_folder[dataset]:
                            data_info = data_info + "NF (.nc), "
                        if ('.csv' in datasets_current_folder[dataset]) and (
                                '.yaml' in datasets_current_folder[dataset]):
                            data_info = data_info + "IF (.yaml + .csv), "
                        elif '.csv' in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF? (.csv), "
                        elif '.yaml' in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF (.yaml), "

                        cleaned_datasets_current_folder[key] = data_info

                if print_ds:
                    if cleaned_datasets_current_folder:
                        for country_ds in cleaned_datasets_current_folder:
                            print(f"{country_ds}: "
                                  f"{cleaned_datasets_current_folder[country_ds]}")
                    else:
                        print("No data available")
                    print("")

            legacy_data[item.name] = cleaned_datasets_current_folder

    all_data = {
        "rep_data": rep_data,
        "legacy_data": legacy_data,
    }

    return all_data
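
# Usage sketch: collect dataset info without printing, then inspect the dict.
#     datasets = get_country_datasets("Kenya", print_ds=False)
#     current = datasets["rep_data"]    # datasets read with this package
#     legacy = datasets["legacy_data"]  # datasets from the legacy workflow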


def get_code_file(
    country_name: str,
    submission: str,
    print_info: bool = False,
) -> Optional[Path]:
    """
    For a given country name and submission, find the script that creates the data.

    Parameters
    ----------
    country_name: str
        String containing the country name or ISO three-letter code
    submission: str
        String of the submission
    print_info: bool = False
        If True print information on the code file found

    Returns
    -------
    returns a pathlib Path object for the code file
    """
    code_file_path = None
    UNFCCC_reader_path = code_path / "UNFCCC_reader"

    # CRF is an exception as it's read using the UNFCCC_CRF_reader module,
    # so we return the path to that.
    if submission[0:3] == "CRF":
        return root_path / "UNFCCC_CRF_reader"
    if submission[0:2] == "DI":
        return root_path / "UNFCCC_DI_reader"

    # obtain country code
    country_code = get_country_code(country_name)

    if print_info:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    with open(UNFCCC_reader_path / "folder_mapping.json", "r") as mapping_file:
        folder_mapping = json.load(mapping_file)

    if country_code not in folder_mapping:
        if print_info:
            print("No code available")
            print("")
    else:
        country_folder = UNFCCC_reader_path / folder_mapping[country_code]
        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"

        for file in country_folder.iterdir():
            if file.match(code_file_name_candidate):
                if code_file_path is not None:
                    raise ValueError(f"Found multiple code file candidates: "
                                     f"{code_file_path} and {file.name}. "
                                     f"Please use only one file with name "
                                     f"'read_ISO3_submission_XXX.YYY'.")
                else:
                    if print_info:
                        print(f"Found code file {file.relative_to(root_path)}")
                    code_file_path = file

    if code_file_path is not None:
        return code_file_path.relative_to(root_path)
    else:
        return None
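
# Usage sketch (country and submission strings are examples):
#     code_file = get_code_file("KEN", "BUR1")
#     if code_file is not None:
#         print(f"reader script: {code_file}")
# For "CRF..." and "DI..." submissions the path of the corresponding reader
# module is returned instead of an individual script.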