get_submissions_info.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573
  1. # helper functions to get information on available submissions
  2. # and data reading functions for a given country
  3. from typing import List, Dict
  4. from pathlib import Path
  5. import json
  6. import pycountry
  7. #import os
  8. root_path = Path(__file__).parents[2].absolute()
  9. root_path = root_path.resolve()
  10. code_path = root_path / "UNFCCC_GHG_data" / "UNFCCC_reader"
  11. # beware, folders below are different than for CRF reader
  12. downloaded_data_path = root_path / "downloaded_data"
  13. extracted_data_path = root_path / "extracted_data"
  14. legacy_data_path = root_path / "legacy_data"
  15. # TODO: move this to general util package
  16. custom_country_mapping = {
  17. "EUA": "European Union",
  18. "EUC": "European Union",
  19. "FRK": "France",
  20. "DKE": "Denmark",
  21. "DNM": "Denmark",
  22. "GBK": "United Kingdom of Great Britain and Northern Ireland",
  23. }
  24. custom_folders = {
  25. 'Venezeula_(Bolivarian_Republic_of)': 'VEN',
  26. 'Venezuela_(Bolivarian_Republic_of)': 'VEN',
  27. 'Micronesia_(Federated_State_of)': 'FSM',
  28. 'Micronesia_(Federated_States_of)': 'FSM',
  29. 'The_Republic_of_North_Macedonia': 'MKD',
  30. 'Republic_of_Korea': 'KOR',
  31. 'Bolivia_(Plurinational_State_of)': 'BOL',
  32. 'Türkiye': 'TUR',
  33. 'Iran_(Islamic_Republic_of)': 'IRN',
  34. 'Côte_d’Ivoire': 'CIV',
  35. 'Democratic_Republic_of_the_Congo': "COD",
  36. 'European_Union': 'EUA',
  37. 'Taiwan': 'TWN',
  38. }
  39. def get_country_submissions(
  40. country_name: str,
  41. print_sub: bool = True,
  42. ) -> Dict[str, List[str]]:
  43. """
  44. Input is a three letter ISO UNFCCC_GHG_data for a country, or the countries name.
  45. The function tries to map the country name to an ISO UNFCCC_GHG_data and then
  46. queries the folder mapping files for folders.
  47. Parameters
  48. ----------
  49. country_name: str
  50. String containing the country name or ISO 3 letter UNFCCC_GHG_data
  51. print_sub: bool
  52. If True information on submissions will be written to stdout
  53. Returns
  54. -------
  55. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  56. Each value is a list of folders
  57. """
  58. data_folder = downloaded_data_path
  59. country_code = get_country_code(country_name)
  60. if print_sub:
  61. print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
  62. country_submissions = {}
  63. if print_sub:
  64. print(f"#" * 80)
  65. print(f"The following submissions are available for {country_name}")
  66. for item in data_folder.iterdir():
  67. if item.is_dir():
  68. if print_sub:
  69. print("")
  70. print("-" * 80)
  71. print(f"Data folder {item.name}")
  72. print("-" * 80)
  73. with open(item / "folder_mapping.json", "r") as mapping_file:
  74. folder_mapping = json.load(mapping_file)
  75. if country_code in folder_mapping:
  76. country_folders = folder_mapping[country_code]
  77. if isinstance(country_folders, str):
  78. # only one folder
  79. country_folders = [country_folders]
  80. submission_folders = []
  81. for country_folder in country_folders:
  82. current_folder = item / country_folder
  83. if print_sub:
  84. print(f"Submissions in folder {country_folder}:")
  85. for submission_folder in current_folder.iterdir():
  86. if submission_folder.is_dir():
  87. if print_sub:
  88. print(submission_folder.name)
  89. submission_folders.append(submission_folder.name)
  90. country_submissions[item.name] = submission_folders
  91. else:
  92. print(f"No submissions available for {country_name}.")
  93. return country_submissions
  94. def get_country_datasets(
  95. country_name: str,
  96. print_ds: bool = True,
  97. ) -> Dict[str, List[str]]:
  98. """
  99. Input is a three letter ISO UNFCCC_GHG_data for a country, or the country's name.
  100. The function tries to map the country name to an ISO UNFCCC_GHG_data and then
  101. checks the UNFCCC_GHG_data and data folders for content on the country.
  102. Parameters
  103. ----------
  104. country_name: str
  105. String containing the country name or ISO 3 letter UNFCCC_GHG_data
  106. print_ds: bool
  107. If True information on submissions will be written to stdout
  108. Returns
  109. -------
  110. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  111. Each value is a list of folders
  112. """
  113. data_folder = extracted_data_path
  114. data_folder_legacy = legacy_data_path
  115. # obtain country UNFCCC_GHG_data
  116. country_code = get_country_code(country_name)
  117. if print_ds:
  118. print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
  119. rep_data = {}
  120. # data
  121. if print_ds:
  122. print(f"#" * 80)
  123. print(f"The following datasets are available for {country_name}")
  124. for item in data_folder.iterdir():
  125. if item.is_dir():
  126. cleaned_datasets_current_folder = {}
  127. if print_ds:
  128. print("-" * 80)
  129. print(f"Data folder {item.name}")
  130. print("-" * 80)
  131. with open(item / "folder_mapping.json", "r") as mapping_file:
  132. folder_mapping = json.load(mapping_file)
  133. if country_code not in folder_mapping:
  134. if print_ds:
  135. print("No data available")
  136. print("")
  137. else:
  138. country_folder = folder_mapping[country_code]
  139. if not isinstance(country_folder, str):
  140. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  141. datasets_current_folder = {}
  142. current_folder = item / country_folder
  143. for data_file in current_folder.iterdir():
  144. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  145. if data_file.stem in datasets_current_folder:
  146. datasets_current_folder[data_file.stem].append(data_file.suffix)
  147. else:
  148. datasets_current_folder[data_file.stem] = [data_file.suffix]
  149. for dataset in datasets_current_folder:
  150. # process filename to get submission
  151. parts = dataset.split('_')
  152. if parts[0] != country_code:
  153. cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
  154. else:
  155. terminology = "_".join(parts[3 : ])
  156. key = f"{parts[1]} ({parts[2]}, {terminology})"
  157. data_info = ""
  158. if '.nc' in datasets_current_folder[dataset]:
  159. data_info = data_info + "NF (.nc), "
  160. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  161. data_info = data_info + "IF (.yaml + .csv), "
  162. elif '.csv' in datasets_current_folder[dataset]:
  163. data_info = data_info + "incomplete IF? (.csv), "
  164. elif '.yaml' in datasets_current_folder[dataset]:
  165. data_info = data_info + "incomplete IF (.yaml), "
  166. code_file = get_code_file(country_code, parts[1])
  167. if code_file:
  168. data_info = data_info + f"UNFCCC_GHG_data: {code_file.name}"
  169. else:
  170. data_info = data_info + f"UNFCCC_GHG_data: not found"
  171. cleaned_datasets_current_folder[key] = data_info
  172. if print_ds:
  173. if cleaned_datasets_current_folder:
  174. for country_ds in cleaned_datasets_current_folder:
  175. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  176. else:
  177. print("No data available")
  178. print("")
  179. rep_data[item.name] = cleaned_datasets_current_folder
  180. # legacy data
  181. if print_ds:
  182. print(f"#" * 80)
  183. print(f"The following legacy datasets are available for {country_name}")
  184. legacy_data = {}
  185. for item in data_folder_legacy.iterdir():
  186. if item.is_dir():
  187. cleaned_datasets_current_folder = {}
  188. if print_ds:
  189. print("-" * 80)
  190. print(f"Data folder {item.name}")
  191. print("-" * 80)
  192. with open(item / "folder_mapping.json", "r") as mapping_file:
  193. folder_mapping = json.load(mapping_file)
  194. if country_code not in folder_mapping:
  195. if print_ds:
  196. print("No data available")
  197. print("")
  198. else:
  199. country_folder = folder_mapping[country_code]
  200. if not isinstance(country_folder, str):
  201. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  202. datasets_current_folder = {}
  203. current_folder = item / country_folder
  204. for data_file in current_folder.iterdir():
  205. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  206. if data_file.stem in datasets_current_folder:
  207. datasets_current_folder[data_file.stem].append(data_file.suffix)
  208. else:
  209. datasets_current_folder[data_file.stem] = [data_file.suffix]
  210. for dataset in datasets_current_folder:
  211. # process filename to get submission
  212. parts = dataset.split('_')
  213. if parts[0] != country_code:
  214. cleaned_datasets_current_folder[f'Wrong UNFCCC_GHG_data: {parts[0]}'] = dataset
  215. else:
  216. terminology = "_".join(parts[3 : ])
  217. key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
  218. data_info = ""
  219. if '.nc' in datasets_current_folder[dataset]:
  220. data_info = data_info + "NF (.nc), "
  221. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  222. data_info = data_info + "IF (.yaml + .csv), "
  223. elif '.csv' in datasets_current_folder[dataset]:
  224. data_info = data_info + "incomplete IF? (.csv), "
  225. elif '.yaml' in datasets_current_folder[dataset]:
  226. data_info = data_info + "incomplete IF (.yaml), "
  227. cleaned_datasets_current_folder[key] = data_info
  228. if print_ds:
  229. if cleaned_datasets_current_folder:
  230. for country_ds in cleaned_datasets_current_folder:
  231. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  232. else:
  233. print("No data available")
  234. print("")
  235. legacy_data[item.name] = cleaned_datasets_current_folder
  236. all_data = {
  237. "rep_data": rep_data,
  238. "legacy_data": legacy_data,
  239. }
  240. return all_data
  241. def get_country_code(
  242. country_name: str,
  243. )->str:
  244. """
  245. obtain country UNFCCC_GHG_data. If the input is a UNFCCC_GHG_data it will be returned, if the input
  246. is not a three letter UNFCCC_GHG_data a search will be performed
  247. Parameters
  248. __________
  249. country_name: str
  250. Country UNFCCC_GHG_data or name to get the three-letter UNFCCC_GHG_data for.
  251. """
  252. # First check if it's in the list of custom codes
  253. if country_name in custom_country_mapping:
  254. country_code = country_name
  255. else:
  256. try:
  257. # check if it's a 3 letter UNFCCC_GHG_data
  258. country = pycountry.countries.get(alpha_3=country_name)
  259. country_code = country.alpha_3
  260. except:
  261. try:
  262. country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
  263. except:
  264. raise ValueError(f"Country name {country_name} can not be mapped to "
  265. f"any country UNFCCC_GHG_data. Try using the ISO3 UNFCCC_GHG_data directly.")
  266. if len(country) > 1:
  267. country_code = None
  268. for current_country in country:
  269. if current_country.name == country_name:
  270. country_code = current_country.alpha_3
  271. if country_code is None:
  272. raise ValueError(f"Country name {country_name} has {len(country)} "
  273. f"possible results for country codes.")
  274. country_code = country[0].alpha_3
  275. return country_code
  276. def get_possible_inputs(
  277. country_name: str,
  278. submission: str,
  279. print_info: bool = False,
  280. ) -> List[Path]:
  281. """
  282. For given country name and submission find the possible input files
  283. Parameters
  284. ----------
  285. country_name: str
  286. String containing the country name or ISO 3 letter UNFCCC_GHG_data
  287. submission: str
  288. String of the submission
  289. print_info: bool = False
  290. If True print information on UNFCCC_GHG_data found
  291. Returns
  292. -------
  293. returns a list pathlib Path objects for the input files
  294. """
  295. data_folder = downloaded_data_path
  296. # obtain country UNFCCC_GHG_data
  297. country_code = get_country_code(country_name)
  298. if print_info:
  299. print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
  300. input_files = []
  301. for item in data_folder.iterdir():
  302. if item.is_dir():
  303. with open(item / "folder_mapping.json", "r") as mapping_file:
  304. folder_mapping = json.load(mapping_file)
  305. if country_code in folder_mapping:
  306. country_folders = folder_mapping[country_code]
  307. if isinstance(country_folders, str):
  308. # only one folder
  309. country_folders = [country_folders]
  310. for country_folder in country_folders:
  311. input_folder = item / country_folder / submission
  312. if input_folder.exists():
  313. for filepath in input_folder.glob("*"):
  314. input_files.append(filepath.relative_to(root_path))
  315. if print_info:
  316. if input_files:
  317. print(f"Found possible input files:")
  318. for file in input_files:
  319. print(file)
  320. else:
  321. print(f"No input files found")
  322. return input_files
  323. def get_possible_outputs(
  324. country_name: str,
  325. submission: str,
  326. print_info: bool = False,
  327. )-> List[Path]:
  328. """
  329. For given country name and submission find the possible output files
  330. Parameters
  331. ----------
  332. country_name: str
  333. String containing the country name or ISO 3 letter UNFCCC_GHG_data
  334. submission: str
  335. String of the submission
  336. print_info: bool = False
  337. If True print information on outputs found
  338. Returns
  339. -------
  340. returns a list pathlib Path objects for the input files
  341. """
  342. data_folder = extracted_data_path
  343. # obtain country UNFCCC_GHG_data
  344. country_code = get_country_code(country_name)
  345. if print_info:
  346. print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
  347. output_files = []
  348. for item in data_folder.iterdir():
  349. if item.is_dir():
  350. with open(item / "folder_mapping.json", "r") as mapping_file:
  351. folder_mapping = json.load(mapping_file)
  352. if country_code in folder_mapping:
  353. country_folder = folder_mapping[country_code]
  354. if not isinstance(country_folder, str):
  355. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  356. output_folder = item / country_folder
  357. if output_folder.exists():
  358. for filepath in output_folder.glob(country_code + "_" + submission + "*"):
  359. output_files.append(filepath.relative_to(root_path))
  360. if print_info:
  361. if output_files:
  362. print(f"Found possible output files:")
  363. for file in output_files:
  364. print(file)
  365. else:
  366. print(f"No output files found")
  367. return output_files
  368. def get_code_file(
  369. country_name: str,
  370. submission: str,
  371. print_info: bool = False,
  372. ) -> Path:
  373. """
  374. For given country name and submission find the script that creates the data
  375. Parameters
  376. ----------
  377. country_name: str
  378. String containing the country name or ISO 3 letter UNFCCC_GHG_data
  379. submission: str
  380. String of the submission
  381. print_info: bool = False
  382. If True print information on UNFCCC_GHG_data found
  383. Returns
  384. -------
  385. returns a pathlib Path object for the UNFCCC_GHG_data file
  386. """
  387. code_file_path = None
  388. # CRF is an exception as it's read using the UNFCCC_CRF_reader module
  389. # so we return the path to that.
  390. if submission[0:3] == "CRF":
  391. return root_path / "UNFCCC_CRF_reader"
  392. # obtain country UNFCCC_GHG_data
  393. country_code = get_country_code(country_name)
  394. if print_info:
  395. print(f"Country name {country_name} maps to ISO UNFCCC_GHG_data {country_code}")
  396. with open(code_path / "folder_mapping.json", "r") as mapping_file:
  397. folder_mapping = json.load(mapping_file)
  398. if country_code not in folder_mapping:
  399. if print_info:
  400. print("No UNFCCC_GHG_data available")
  401. print("")
  402. else:
  403. country_folder = code_path / folder_mapping[country_code]
  404. code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
  405. for file in country_folder.iterdir():
  406. if file.match(code_file_name_candidate):
  407. if code_file_path is not None:
  408. raise ValueError(f"Found multiple UNFCCC_GHG_data candidates: "
  409. f"{code_file_path} and file.name. "
  410. f"Please use only one file with name "
  411. f"'read_ISO3_submission_XXX.YYY'.")
  412. else:
  413. if print_info:
  414. print(f"Found UNFCCC_GHG_data file {file.relative_to(root_path)}")
  415. code_file_path = file
  416. if code_file_path is not None:
  417. return code_file_path.relative_to(root_path)
  418. else:
  419. return None
  420. def create_folder_mapping(
  421. folder: str,
  422. extracted: bool = False
  423. ) -> None:
  424. """
  425. Create a mapping from 3 letter ISO country codes to folders
  426. based on the subfolders of the given folder. The mapping is
  427. stored in 'folder_mapping.json' in the given folder. Folder
  428. must be given relative to the repository root
  429. Parameters
  430. ----------
  431. folder: str
  432. folder to create the mapping for
  433. extracted: bool = False
  434. If true treat the folder as extracted data, where we
  435. only have one folder per country and no typos in the
  436. names
  437. Returns
  438. -------
  439. Nothing
  440. """
  441. folder = root_path / folder
  442. folder_mapping = {}
  443. #if not extracted:
  444. known_folders = custom_folders
  445. #else:
  446. # known_folders = {}
  447. for item in folder.iterdir():
  448. if item.is_dir() and not item.match("__pycache__"):
  449. if item.name in known_folders:
  450. ISO3 = known_folders[item.name]
  451. else:
  452. try:
  453. country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
  454. if len(country) > 1:
  455. ISO3 = None
  456. for current_country in country:
  457. if current_country.name == item.name.replace("_", " "):
  458. ISO3 = current_country.alpha_3
  459. else:
  460. ISO3 = country[0].alpha_3
  461. except:
  462. ISO3 = None
  463. if ISO3 is None:
  464. print(f"No match for {item.name}")
  465. else:
  466. if ISO3 in folder_mapping.keys():
  467. folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
  468. else:
  469. folder_mapping[ISO3] = item.name
  470. with open(folder / "folder_mapping.json", "w") as mapping_file:
  471. json.dump(folder_mapping, mapping_file, indent=4)