functions.py

import copy
import pycountry
import json
import re
import xarray as xr
import pandas as pd
import numpy as np
from datetime import date
from copy import deepcopy
from typing import Dict, List, Optional
from pathlib import Path

import primap2  # noqa: F401  # assumption: imported to register the ".pr" accessor used below

from .definitions import custom_country_mapping, custom_folders
from .definitions import root_path, downloaded_data_path, extracted_data_path
from .definitions import legacy_data_path, code_path
from .definitions import GWP_factors


def process_data_for_country(
        data_country: xr.Dataset,
        entities_to_ignore: List[str],
        gas_baskets: Dict[str, List[str]],
        filter_dims: Optional[Dict[str, List[str]]] = None,
        cat_terminology_out: Optional[str] = None,
        category_conversion: Dict[str, Dict] = None,
        sectors_out: List[str] = None,
        processing_info_country: Dict = None,
) -> xr.Dataset:
    """
    Process data from DI interface (where necessary).
    * Downscaling including subtraction of time series
    * country specific sector aggregation
    * Conversion to IPCC2006 categories
    * general sector and gas basket aggregation (in new categories)
    """
    # 0: gather information
    countries = list(data_country.coords[data_country.attrs['area']].values)
    if len(countries) > 1:
        raise ValueError(
            f"Found {len(countries)} countries. Only single country data "
            f"can be processed by this function. countries: {countries}")
    else:
        country_code = countries[0]

    # get category terminology
    cat_col = data_country.attrs['cat']
    temp = re.findall(r'\((.*)\)', cat_col)
    cat_terminology_in = temp[0]

    # get scenario
    scenarios = list(data_country.coords[data_country.attrs['scen']].values)
    if len(scenarios) > 1:
        raise ValueError(
            f"Found {len(scenarios)} scenarios. Only single scenario data "
            f"can be processed by this function. Scenarios: {scenarios}")
    scenario = scenarios[0]

    # get source
    sources = list(data_country.coords['source'].values)
    if len(sources) > 1:
        raise ValueError(
            f"Found {len(sources)} sources. Only single source data "
            f"can be processed by this function. Sources: {sources}")
    source = sources[0]

    # check if category name column present
    # TODO: replace 'name' in config by 'additional_cols' dict that defines the cols
    #  and the values
    if 'orig_cat_name' in data_country.coords:
        cat_name_present = True
    else:
        cat_name_present = False

    # 1: general processing
    # remove unused cats
    data_country = data_country.dropna(f'category ({cat_terminology_in})', how='all')
    # remove unused years
    data_country = data_country.dropna('time', how='all')
    # remove variables only containing nan
    nan_vars_country = [var for var in data_country.data_vars if
                        bool(data_country[var].isnull().all().data) is True]
    print(f"removing all-nan variables: {nan_vars_country}")
    data_country = data_country.drop_vars(nan_vars_country)

    # remove unnecessary variables
    entities_ignore_present = [entity for entity in entities_to_ignore if
                               entity in data_country.data_vars]
    data_country = data_country.drop_vars(entities_ignore_present)

    # filter ()
    if filter_dims is not None:
        data_country = data_country.pr.loc[filter_dims]

    # 2: country specific processing
    if processing_info_country is not None:

        if 'tolerance' in processing_info_country:
            tolerance = processing_info_country["tolerance"]
        else:
            tolerance = 0.01

        # remove entities if needed
        if 'ignore_entities' in processing_info_country:
            entities_to_ignore_country = processing_info_country[
                'ignore_entities']
            entities_ignore_present = \
                [entity for entity in entities_to_ignore_country if
                 entity in data_country.data_vars]
            data_country = data_country.drop_vars(entities_ignore_present)

        # take only desired years
        if 'years' in processing_info_country:
            data_country = data_country.pr.loc[
                {'time': processing_info_country['years']}]

        # remove timeseries if desired
        if 'remove_ts' in processing_info_country:
            for case in processing_info_country['remove_ts']:
                remove_info = copy.deepcopy(processing_info_country['remove_ts'][case])
                entities = remove_info.pop("entities")
                for entity in entities:
                    data_country[entity].pr.loc[remove_info] = \
                        data_country[entity].pr.loc[remove_info] * np.nan

        # remove all data for given years if necessary
        if 'remove_years' in processing_info_country:
            data_country = data_country.drop_sel(
                time=processing_info_country['remove_years'])
        # subtract categories
        if 'subtract_cats' in processing_info_country:
            subtract_cats_current = processing_info_country['subtract_cats']
            print(f"Subtracting categories for country {country_code}")
            for cat_to_generate in subtract_cats_current:
                if 'entities' in subtract_cats_current[cat_to_generate].keys():
                    entities_current = subtract_cats_current[cat_to_generate]['entities']
                else:
                    entities_current = list(data_country.data_vars)
                cats_to_subtract = \
                    subtract_cats_current[cat_to_generate]['subtract']
                data_sub = \
                    data_country[entities_current].pr.loc[
                        {'category': cats_to_subtract}].pr.sum(
                        dim='category', skipna=True, min_count=1)
                data_parent = data_country[entities_current].pr.loc[
                    {'category': subtract_cats_current[cat_to_generate]['parent']}]
                data_agg = data_parent - data_sub
                nan_vars = [var for var in data_agg.data_vars if
                            bool(data_agg[var].isnull().all().data) is True]
                data_agg = data_agg.drop_vars(nan_vars)
                if len(data_agg.data_vars) > 0:
                    print(f"Generating {cat_to_generate} through subtraction")
                    data_agg = data_agg.expand_dims([f'category ('
                                                     f'{cat_terminology_in})'])
                    data_agg = data_agg.assign_coords(
                        coords={f'category ({cat_terminology_in})':
                                    (f'category ({cat_terminology_in})',
                                     [cat_to_generate])})
                    if cat_name_present:
                        cat_name = subtract_cats_current[cat_to_generate]['name']
                        data_agg = data_agg.assign_coords(
                            coords={'orig_cat_name':
                                        (f'category ({cat_terminology_in})',
                                         [cat_name])})
                    data_country = data_country.pr.merge(data_agg,
                                                         tolerance=tolerance)
                else:
                    print(f"no data to generate category {cat_to_generate}")
        # downscaling
        if 'downscale' in processing_info_country:
            if 'sectors' in processing_info_country['downscale']:
                sector_downscaling = \
                    processing_info_country['downscale']['sectors']
                for case in sector_downscaling.keys():
                    print(f"Downscaling for {case}.")
                    sector_downscaling_current = sector_downscaling[case]
                    entities = sector_downscaling_current.pop('entities')
                    for entity in entities:
                        data_country[entity] = data_country[
                            entity].pr.downscale_timeseries(
                            **sector_downscaling_current)
                        # , skipna_evaluation_dims=None)

            if 'entities' in processing_info_country['downscale']:
                entity_downscaling = \
                    processing_info_country['downscale']['entities']
                for case in entity_downscaling.keys():
                    print(f"Downscaling for {case}.")
                    # print(data_country.coords[f'category ('
                    #                           f'{cat_terminology_in})'].values)
                    data_country = data_country.pr.downscale_gas_timeseries(
                        **entity_downscaling[case], skipna=True,
                        skipna_evaluation_dims=None)

        # aggregate categories
        if 'aggregate_cats' in processing_info_country:
            if 'agg_tolerance' in processing_info_country:
                agg_tolerance = processing_info_country['agg_tolerance']
            else:
                agg_tolerance = tolerance
            aggregate_cats_current = processing_info_country['aggregate_cats']
            print(
                f"Aggregating categories for country {country_code}, source {source}, "
                f"scenario {scenario}")
            for cat_to_agg in aggregate_cats_current:
                print(f"Category: {cat_to_agg}")
                source_cats = aggregate_cats_current[cat_to_agg]['sources']
                data_agg = data_country.pr.loc[{'category': source_cats}].pr.sum(
                    dim='category', skipna=True, min_count=1)
                nan_vars = [var for var in data_agg.data_vars if
                            bool(data_agg[var].isnull().all().data) is True]
                data_agg = data_agg.drop_vars(nan_vars)
                if len(data_agg.data_vars) > 0:
                    data_agg = data_agg.expand_dims([f'category ('
                                                     f'{cat_terminology_in})'])
                    data_agg = data_agg.assign_coords(
                        coords={f'category ({cat_terminology_in})':
                                    (f'category ({cat_terminology_in})',
                                     [cat_to_agg])})
                    if cat_name_present:
                        cat_name = aggregate_cats_current[cat_to_agg]['name']
                        data_agg = data_agg.assign_coords(
                            coords={'orig_cat_name':
                                        (f'category ({cat_terminology_in})',
                                         [cat_name])})
                    data_country = data_country.pr.merge(data_agg,
                                                         tolerance=agg_tolerance)
                else:
                    print(f"no data to aggregate category {cat_to_agg}")

        # copy HFCs and PFCs with default factors
        if 'basket_copy' in processing_info_country:
            GWPs_to_add = processing_info_country["basket_copy"]["GWPs_to_add"]
            entities = processing_info_country["basket_copy"]["entities"]
            source_GWP = processing_info_country["basket_copy"]["source_GWP"]
            for entity in entities:
                data_source = data_country[f'{entity} ({source_GWP})']
                for GWP in GWPs_to_add:
                    data_GWP = data_source * \
                               GWP_factors[f"{source_GWP}_to_{GWP}"][entity]
                    data_GWP.attrs["entity"] = entity
                    data_GWP.attrs["gwp_context"] = GWP
                    data_country[f"{entity} ({GWP})"] = data_GWP

        # aggregate gases if desired
        if 'aggregate_gases' in processing_info_country:
            # TODO: why use different code here than below. Can this fill non-existent
            #  gas baskets?
            for case in processing_info_country['aggregate_gases'].keys():
                case_info = processing_info_country['aggregate_gases'][case]
                data_country[case_info['basket']] = \
                    data_country.pr.fill_na_gas_basket_from_contents(
                        **case_info)
    # 3: map categories
    if category_conversion is not None:
        data_country = convert_categories(
            data_country,
            category_conversion,
            cat_terminology_out,
            debug=False,
            tolerance=0.01,
        )
    else:
        cat_terminology_out = cat_terminology_in

    # more general processing
    # reduce categories to output cats
    if sectors_out is not None:
        cats_to_keep = [cat for cat in
                        data_country.coords[f'category ({cat_terminology_out})'].values
                        if cat in sectors_out]
        data_country = data_country.pr.loc[{'category': cats_to_keep}]

    # create gas baskets
    entities_present = set(data_country.data_vars)
    for basket in gas_baskets.keys():
        basket_contents_present = [gas for gas in gas_baskets[basket] if
                                   gas in entities_present]
        if len(basket_contents_present) > 0:
            if basket in list(data_country.data_vars):
                data_country[basket] = data_country.pr.fill_na_gas_basket_from_contents(
                    basket=basket, basket_contents=basket_contents_present,
                    skipna=True, min_count=1)
            else:
                try:
                    # print(data_country.data_vars)
                    data_country[basket] = xr.full_like(data_country["CO2"],
                                                        np.nan).pr.quantify(
                        units="Gg CO2 / year")
                    data_country[basket].attrs = {"entity": basket.split(' ')[0],
                                                  "gwp_context": basket.split(' ')[1][
                                                                 1:-1]}
                    data_country[basket] = data_country.pr.gas_basket_contents_sum(
                        basket=basket, basket_contents=basket_contents_present,
                        min_count=1)
                    entities_present.add(basket)
                except Exception as ex:
                    print(f"No gas basket created for {country_code}, {source}, "
                          f"{scenario}: {ex}")

    # amend title and comment
    data_country.attrs["comment"] = data_country.attrs["comment"] + f" Processed on " \
                                                                    f"{date.today()}"
    data_country.attrs["title"] = data_country.attrs["title"] + f" Processed on " \
                                                                f"{date.today()}"

    return data_country
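
# Illustrative usage sketch (kept as a comment so the module stays importable).
# The file name, gas-basket composition and options below are hypothetical
# examples, not values defined in this module; real calls pass the
# country-specific config used elsewhere in this package.
#
#     import primap2
#     raw = primap2.open_dataset("MYS_DI_raw.nc")  # hypothetical input file
#     processed = process_data_for_country(
#         raw,
#         entities_to_ignore=[],
#         gas_baskets={"KYOTOGHG (AR4GWP100)": ["CO2", "CH4", "N2O"]},
#         cat_terminology_out="IPCC2006_PRIMAP",
#     )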


def convert_categories(
        ds_input: xr.Dataset,
        conversion: Dict[str, Dict[str, str]],
        # terminology_from: str,
        terminology_to: str,
        debug: bool = False,
        tolerance: float = 0.01,
) -> xr.Dataset:
    """
    convert data from one category terminology to another
    """
    print(f"converting categories to {terminology_to}")

    if 'orig_cat_name' in ds_input.coords:
        cat_name_present = True
    else:
        cat_name_present = False
    ds_converted = ds_input.copy(deep=True)
    ds_converted.attrs = deepcopy(ds_input.attrs)
    # TODO: change attrs for additional coordinates

    # change category terminology
    cat_dim = ds_converted.attrs["cat"]
    ds_converted.attrs["cat"] = f"category ({terminology_to})"
    ds_converted = ds_converted.rename({cat_dim: ds_converted.attrs["cat"]})

    # find categories present in dataset
    cats_present = list(ds_converted.coords[f'category ({terminology_to})'])

    # restrict categories and map category names
    if 'mapping' in conversion.keys():
        mapping_cats_present = [cat for cat in list(conversion['mapping'].keys()) if
                                cat in cats_present]
        ds_converted = ds_converted.pr.loc[
            {'category': mapping_cats_present}]

        from_cats = ds_converted.coords[f'category ({terminology_to})'].values
        to_cats = pd.Series(from_cats).replace(conversion['mapping'])
        ds_converted = ds_converted.assign_coords({f'category ({terminology_to})':
                                                       (f'category ({terminology_to})',
                                                        to_cats)})

    # redo the list of present cats after mapping, as we have new categories in the
    # target terminology now
    cats_present_mapped = list(ds_converted.coords[f'category ('
                                                   f'{terminology_to})'].values)
    # aggregate categories
    if 'aggregate' in conversion:
        aggregate_cats = conversion['aggregate']
        for cat_to_agg in aggregate_cats:
            if debug:
                print(f"Category: {cat_to_agg}")
            source_cats = [cat for cat in aggregate_cats[cat_to_agg]['sources'] if
                           cat in cats_present_mapped]
            if debug:
                print(source_cats)
            data_agg = ds_converted.pr.loc[{'category': source_cats}].pr.sum(
                dim='category', skipna=True, min_count=1)
            nan_vars = [var for var in data_agg.data_vars if
                        bool(data_agg[var].isnull().all().data) is True]
            data_agg = data_agg.drop_vars(nan_vars)
            if len(data_agg.data_vars) > 0:
                data_agg = data_agg.expand_dims([f'category ({terminology_to})'])
                data_agg = data_agg.assign_coords(
                    coords={f'category ({terminology_to})':
                                (f'category ({terminology_to})', [cat_to_agg])})
                if cat_name_present:
                    data_agg = data_agg.assign_coords(
                        coords={'orig_cat_name':
                                    (f'category ({terminology_to})',
                                     [aggregate_cats[cat_to_agg]['name']])})
                ds_converted = ds_converted.pr.merge(data_agg, tolerance=tolerance)
                cats_present_mapped.append(cat_to_agg)
            else:
                print(f"no data to aggregate category {cat_to_agg}")

    return ds_converted
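
# Sketch of the expected ``conversion`` structure (hypothetical category codes,
# not taken from a real conversion config):
#
#     conversion = {
#         "mapping": {"1A": "1.A", "1B": "1.B"},
#         "aggregate": {"1": {"sources": ["1.A", "1.B"], "name": "Energy"}},
#     }
#     ds_2006 = convert_categories(ds, conversion, "IPCC2006_PRIMAP", debug=True)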


def get_country_name(
        country_code: str,
) -> str:
    """get country name from code"""
    if country_code in custom_country_mapping:
        country_name = custom_country_mapping[country_code]
    else:
        try:
            country = pycountry.countries.get(alpha_3=country_code)
            country_name = country.name
        except:
            raise ValueError(f"Country code {country_code} can not be mapped to "
                             f"any country")

    return country_name
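
# Example (result depends on the installed pycountry data and on
# custom_country_mapping):
#
#     get_country_name("DEU")   # -> "Germany"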


def get_country_code(
        country_name: str,
) -> str:
    """
    Obtain the country code. If the input is a code it will be returned;
    if the input is not a three-letter code a search will be performed.

    Parameters
    ----------
    country_name: str
        Country code or name to get the three-letter code for.

    Returns
    -------
    country_code: str
    """
    # First check if it's in the list of custom codes
    if country_name in custom_country_mapping:
        country_code = country_name
    else:
        try:
            # check if it's a 3 letter code
            country = pycountry.countries.get(alpha_3=country_name)
            country_code = country.alpha_3
        except:
            try:
                country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
            except:
                raise ValueError(f"Country name {country_name} can not be mapped to "
                                 f"any country code. Try using the ISO3 code directly.")
            if len(country) > 1:
                country_code = None
                for current_country in country:
                    if current_country.name == country_name:
                        country_code = current_country.alpha_3
                if country_code is None:
                    raise ValueError(f"Country name {country_name} has {len(country)} "
                                     f"possible results for country codes.")
            else:
                country_code = country[0].alpha_3

    return country_code
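
# Example (hedged): both calls should return "MYS" with a standard pycountry
# installation; fuzzy-match behaviour can differ between pycountry versions.
#
#     get_country_code("MYS")
#     get_country_code("Malaysia")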


def create_folder_mapping(
        folder: str,
        extracted: bool = False
) -> None:
    """
    Create a mapping from 3 letter ISO country codes to folders
    based on the subfolders of the given folder. The mapping is
    stored in 'folder_mapping.json' in the given folder. Folder
    must be given relative to the repository root

    Parameters
    ----------
    folder: str
        folder to create the mapping for
    extracted: bool = False
        If true treat the folder as extracted data, where we
        only have one folder per country and no typos in the
        names

    Returns
    -------
    Nothing
    """
    folder = root_path / folder
    folder_mapping = {}
    # if not extracted:
    known_folders = custom_folders
    # else:
    #     known_folders = {}
    for item in folder.iterdir():
        if item.is_dir() and not item.match("__pycache__"):
            if item.name in known_folders:
                ISO3 = known_folders[item.name]
            else:
                try:
                    country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
                    if len(country) > 1:
                        ISO3 = None
                        for current_country in country:
                            if current_country.name == item.name.replace("_", " "):
                                ISO3 = current_country.alpha_3
                    else:
                        ISO3 = country[0].alpha_3
                except:
                    ISO3 = None

            if ISO3 is None:
                print(f"No match for {item.name}")
            else:
                if ISO3 in folder_mapping.keys():
                    folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
                else:
                    folder_mapping[ISO3] = item.name

    with open(folder / "folder_mapping.json", "w") as mapping_file:
        json.dump(folder_mapping, mapping_file, indent=4)
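
# Illustrative call (the folder name below is a hypothetical path relative to
# the repository root with one subfolder per country):
#
#     create_folder_mapping("downloaded_data/UNFCCC")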


# TODO add crf
def get_country_submissions(
        country_name: str,
        print_sub: bool = True,
) -> Dict[str, List[str]]:
    """
    Input is a three letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    queries the folder mapping files for folders.

    Parameters
    ----------
    country_name: str
        String containing the country name or ISO 3 letter code
    print_sub: bool
        If True information on submissions will be written to stdout

    Returns
    -------
    returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
    Each value is a list of folders
    """
    data_folder = downloaded_data_path

    country_code = get_country_code(country_name)

    if print_sub:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    country_submissions = {}
    if print_sub:
        print("#" * 80)
        print(f"The following submissions are available for {country_name}")
    for item in data_folder.iterdir():
        if item.is_dir():
            if print_sub:
                print("")
                print("-" * 80)
                print(f"Data folder {item.name}")
                print("-" * 80)
            with open(item / "folder_mapping.json", "r") as mapping_file:
                folder_mapping = json.load(mapping_file)
            if country_code in folder_mapping:
                country_folders = folder_mapping[country_code]
                if isinstance(country_folders, str):
                    # only one folder
                    country_folders = [country_folders]

                submission_folders = []
                for country_folder in country_folders:
                    current_folder = item / country_folder
                    if print_sub:
                        print(f"Submissions in folder {country_folder}:")

                    for submission_folder in current_folder.iterdir():
                        if submission_folder.is_dir():
                            if print_sub:
                                print(submission_folder.name)
                            submission_folders.append(submission_folder.name)

                country_submissions[item.name] = submission_folders
            else:
                print(f"No submissions available for {country_name}.")

    return country_submissions
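
# Illustrative call (hypothetical country; the returned folder names depend on
# the local downloaded_data tree):
#
#     submissions = get_country_submissions("Malaysia", print_sub=False)
#     # e.g. {"UNFCCC": ["BUR3", "BUR4"], "non-UNFCCC": []}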


def get_country_datasets(
        country_name: str,
        print_ds: bool = True,
) -> Dict[str, List[str]]:
    """
    Input is a three letter ISO code for a country, or the country's name.
    The function tries to map the country name to an ISO code and then
    checks the code and data folders for content on the country.

    Parameters
    ----------
    country_name: str
        String containing the country name or ISO 3 letter code
    print_ds: bool
        If True information on submissions will be written to stdout

    Returns
    -------
    returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
    Each value is a list of folders
    """
    data_folder = extracted_data_path
    data_folder_legacy = legacy_data_path

    # obtain country code
    country_code = get_country_code(country_name)

    if print_ds:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    rep_data = {}
    # data
    if print_ds:
        print("#" * 80)
        print(f"The following datasets are available for {country_name}")
    for item in data_folder.iterdir():
        if item.is_dir():
            cleaned_datasets_current_folder = {}
            if print_ds:
                print("-" * 80)
                print(f"Data folder {item.name}")
                print("-" * 80)
            with open(item / "folder_mapping.json", "r") as mapping_file:
                folder_mapping = json.load(mapping_file)
            if country_code not in folder_mapping:
                if print_ds:
                    print("No data available")
                    print("")
            else:
                country_folder = folder_mapping[country_code]
                if not isinstance(country_folder, str):
                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")

                datasets_current_folder = {}
                current_folder = item / country_folder

                for data_file in current_folder.iterdir():
                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
                        if data_file.stem in datasets_current_folder:
                            datasets_current_folder[data_file.stem].append(data_file.suffix)
                        else:
                            datasets_current_folder[data_file.stem] = [data_file.suffix]

                for dataset in datasets_current_folder:
                    # process filename to get submission
                    parts = dataset.split('_')
                    if parts[0] != country_code:
                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = \
                            dataset
                    else:
                        terminology = "_".join(parts[3:])
                        key = f"{parts[1]} ({parts[2]}, {terminology})"
                        data_info = ""
                        if '.nc' in datasets_current_folder[dataset]:
                            data_info = data_info + "NF (.nc), "
                        if ('.csv' in datasets_current_folder[dataset]) and (
                                '.yaml' in datasets_current_folder[dataset]):
                            data_info = data_info + "IF (.yaml + .csv), "
                        elif '.csv' in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF? (.csv), "
                        elif '.yaml' in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF (.yaml), "

                        code_file = get_code_file(country_code, parts[1])
                        if code_file:
                            data_info = data_info + f"code: {code_file.name}"
                        else:
                            data_info = data_info + "code: not found"

                        cleaned_datasets_current_folder[key] = data_info

                if print_ds:
                    if cleaned_datasets_current_folder:
                        for country_ds in cleaned_datasets_current_folder:
                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
                    else:
                        print("No data available")
                    print("")

            rep_data[item.name] = cleaned_datasets_current_folder

    # legacy data
    if print_ds:
        print("#" * 80)
        print(f"The following legacy datasets are available for {country_name}")
    legacy_data = {}
    for item in data_folder_legacy.iterdir():
        if item.is_dir():
            cleaned_datasets_current_folder = {}
            if print_ds:
                print("-" * 80)
                print(f"Data folder {item.name}")
                print("-" * 80)
            with open(item / "folder_mapping.json", "r") as mapping_file:
                folder_mapping = json.load(mapping_file)
            if country_code not in folder_mapping:
                if print_ds:
                    print("No data available")
                    print("")
            else:
                country_folder = folder_mapping[country_code]
                if not isinstance(country_folder, str):
                    raise ValueError("Wrong data type in folder mapping json file. Should be str.")

                datasets_current_folder = {}
                current_folder = item / country_folder

                for data_file in current_folder.iterdir():
                    if data_file.suffix in ['.nc', '.yaml', '.csv']:
                        if data_file.stem in datasets_current_folder:
                            datasets_current_folder[data_file.stem].append(data_file.suffix)
                        else:
                            datasets_current_folder[data_file.stem] = [data_file.suffix]

                for dataset in datasets_current_folder:
                    # process filename to get submission
                    parts = dataset.split('_')
                    if parts[0] != country_code:
                        cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
                    else:
                        terminology = "_".join(parts[3:])
                        key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
                        data_info = ""
                        if '.nc' in datasets_current_folder[dataset]:
                            data_info = data_info + "NF (.nc), "
                        if ('.csv' in datasets_current_folder[dataset]) and (
                                '.yaml' in datasets_current_folder[dataset]):
                            data_info = data_info + "IF (.yaml + .csv), "
                        elif '.csv' in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF? (.csv), "
                        elif '.yaml' in datasets_current_folder[dataset]:
                            data_info = data_info + "incomplete IF (.yaml), "

                        cleaned_datasets_current_folder[key] = data_info

                if print_ds:
                    if cleaned_datasets_current_folder:
                        for country_ds in cleaned_datasets_current_folder:
                            print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
                    else:
                        print("No data available")
                    print("")

            legacy_data[item.name] = cleaned_datasets_current_folder

    all_data = {
        "rep_data": rep_data,
        "legacy_data": legacy_data,
    }

    return all_data
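
# Illustrative call (hypothetical country; contents depend on the local
# extracted_data and legacy_data trees):
#
#     datasets = get_country_datasets("MYS", print_ds=False)
#     # datasets["rep_data"] and datasets["legacy_data"] map data folders to
#     # short descriptions of the files found for each submission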


def get_code_file(
        country_name: str,
        submission: str,
        print_info: bool = False,
) -> Optional[Path]:
    """
    For given country name and submission find the script that creates the data

    Parameters
    ----------
    country_name: str
        String containing the country name or ISO 3 letter code
    submission: str
        String of the submission
    print_info: bool = False
        If True print information on the code file found

    Returns
    -------
    returns a pathlib Path object for the code file
    """
    code_file_path = None
    UNFCCC_reader_path = code_path / "UNFCCC_reader"

    # CRF is an exception as it's read using the UNFCCC_CRF_reader module
    # so we return the path to that.
    if submission[0:3] == "CRF":
        return root_path / "UNFCCC_CRF_reader"

    if submission[0:2] == "DI":
        return root_path / "UNFCCC_DI_reader"

    # obtain country code
    country_code = get_country_code(country_name)

    if print_info:
        print(f"Country name {country_name} maps to ISO code {country_code}")

    with open(UNFCCC_reader_path / "folder_mapping.json", "r") as mapping_file:
        folder_mapping = json.load(mapping_file)

    if country_code not in folder_mapping:
        if print_info:
            print("No code available")
            print("")
    else:
        country_folder = UNFCCC_reader_path / folder_mapping[country_code]
        code_file_name_candidate = "read_" + country_code + "_" + submission + "*"

        for file in country_folder.iterdir():
            if file.match(code_file_name_candidate):
                if code_file_path is not None:
                    raise ValueError(f"Found multiple code file candidates: "
                                     f"{code_file_path} and {file.name}. "
                                     f"Please use only one file with name "
                                     f"'read_ISO3_submission_XXX.YYY'.")
                else:
                    if print_info:
                        print(f"Found code file {file.relative_to(root_path)}")
                code_file_path = file

    if code_file_path is not None:
        return code_file_path.relative_to(root_path)
    else:
        return None
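
# Illustrative call (hypothetical submission name and result; the actual script
# path depends on the UNFCCC_reader folder for the country):
#
#     script = get_code_file("MYS", "BUR3")
#     # -> e.g. a Path like ".../UNFCCC_reader/Malaysia/read_MYS_BUR3_XXX.py"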


def fix_rows(data: pd.DataFrame, rows_to_fix: list, col_to_use: str, n_rows: int) -> pd.DataFrame:
    """
    Function to fix rows that have been split during reading from pdf
    This is the version used for Malaysia BUR3,4. Adapt for other BURs if needed

    :param data:
    :param rows_to_fix:
    :param col_to_use:
    :param n_rows:
    :return:
    """
    for row in rows_to_fix:
        # print(row)
        # find the row number and collect the row and the next two rows
        index = data.loc[data[col_to_use] == row].index
        # print(list(index))
        if not list(index):
            print(f"Can't merge split row {row}")
            print(data[col_to_use])
            continue
        # print(f"Merging split row {row} for table {page}")
        loc = data.index.get_loc(index[0])
        if n_rows == -3:
            locs_to_merge = list(range(loc - 1, loc + 2))
        elif n_rows == -5:
            locs_to_merge = list(range(loc - 1, loc + 4))
        else:
            locs_to_merge = list(range(loc, loc + n_rows))
        rows_to_merge = data.iloc[locs_to_merge]
        indices_to_merge = rows_to_merge.index
        # join the rows
        new_row = rows_to_merge.agg(' '.join)
        # replace the double spaces that are created
        # must be done here and not at the end as splits are not always
        # the same and join would produce different col values
        new_row = new_row.str.replace("  ", " ")
        new_row = new_row.str.replace("N O", "NO")
        new_row = new_row.str.replace(", N", ",N")
        new_row = new_row.str.replace("- ", "-")
        data.loc[indices_to_merge[0]] = new_row
        data = data.drop(indices_to_merge[1:])
    return data
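
# Illustrative usage (hypothetical DataFrame, column name and row fragment):
# with n_rows=2 each listed row is merged with the row that follows it in the
# raw PDF table.
#
#     df = fix_rows(df, rows_to_fix=["1.A.1 Energy"], col_to_use="category", n_rows=2)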