get_submissions_info.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573
  1. # helper functions to get information on available submissions
  2. # and data reading functions for a given country
  3. from typing import List, Dict
  4. from pathlib import Path
  5. import json
  6. import pycountry
  7. #import os
  8. root_path = Path(__file__).parents[2].absolute()
  9. root_path = root_path.resolve()
  10. code_path = root_path / "code" / "UNFCCC_reader"
  11. # beware, folders below are different than for CRF reader
  12. downloaded_data_path = root_path / "downloaded_data"
  13. extracted_data_path = root_path / "extracted_data"
  14. legacy_data_path = root_path / "legacy_data"
  15. # TODO: move this to general util package
  16. custom_country_mapping = {
  17. "EUA": "European Union",
  18. "EUC": "European Union",
  19. "FRK": "France",
  20. "DKE": "Denmark",
  21. "DNM": "Denmark",
  22. "GBK": "United Kingdom of Great Britain and Northern Ireland",
  23. }
  24. custom_folders = {
  25. 'Venezeula_(Bolivarian_Republic_of)': 'VEN',
  26. 'Venezuela_(Bolivarian_Republic_of)': 'VEN',
  27. 'Micronesia_(Federated_State_of)': 'FSM',
  28. 'Micronesia_(Federated_States_of)': 'FSM',
  29. 'The_Republic_of_North_Macedonia': 'MKD',
  30. 'Republic_of_Korea': 'KOR',
  31. 'Bolivia_(Plurinational_State_of)': 'BOL',
  32. 'Türkiye': 'TUR',
  33. 'Iran_(Islamic_Republic_of)': 'IRN',
  34. 'Côte_d’Ivoire': 'CIV',
  35. 'Democratic_Republic_of_the_Congo': "COD",
  36. 'European_Union': 'EUA',
  37. 'Taiwan': 'TWN',
  38. }
  39. def get_country_submissions(
  40. country_name: str,
  41. print_sub: bool = True,
  42. ) -> Dict[str, List[str]]:
  43. """
  44. Input is a three letter ISO code for a country, or the countries name.
  45. The function tries to map the country name to an ISO code and then
  46. queries the folder mapping files for folders.
  47. Parameters
  48. ----------
  49. country_name: str
  50. String containing the country name or ISO 3 letter code
  51. print_sub: bool
  52. If True information on submissions will be written to stdout
  53. Returns
  54. -------
  55. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  56. Each value is a list of folders
  57. """
  58. data_folder = downloaded_data_path
  59. country_code = get_country_code(country_name)
  60. if print_sub:
  61. print(f"Country name {country_name} maps to ISO code {country_code}")
  62. country_submissions = {}
  63. if print_sub:
  64. print(f"#" * 80)
  65. print(f"The following submissions are available for {country_name}")
  66. for item in data_folder.iterdir():
  67. if item.is_dir():
  68. if print_sub:
  69. print("")
  70. print("-" * 80)
  71. print(f"Data folder {item.name}")
  72. print("-" * 80)
  73. with open(item / "folder_mapping.json", "r") as mapping_file:
  74. folder_mapping = json.load(mapping_file)
  75. if country_code in folder_mapping:
  76. country_folders = folder_mapping[country_code]
  77. if isinstance(country_folders, str):
  78. # only one folder
  79. country_folders = [country_folders]
  80. submission_folders = []
  81. for country_folder in country_folders:
  82. current_folder = item / country_folder
  83. if print_sub:
  84. print(f"Submissions in folder {country_folder}:")
  85. for submission_folder in current_folder.iterdir():
  86. if submission_folder.is_dir():
  87. if print_sub:
  88. print(submission_folder.name)
  89. submission_folders.append(submission_folder.name)
  90. country_submissions[item.name] = submission_folders
  91. else:
  92. print(f"No submissions available for {country_name}.")
  93. return country_submissions
  94. def get_country_datasets(
  95. country_name: str,
  96. print_ds: bool = True,
  97. ) -> Dict[str, List[str]]:
  98. """
  99. Input is a three letter ISO code for a country, or the country's name.
  100. The function tries to map the country name to an ISO code and then
  101. checks the code and data folders for content on the country.
  102. Parameters
  103. ----------
  104. country_name: str
  105. String containing the country name or ISO 3 letter code
  106. print_ds: bool
  107. If True information on submissions will be written to stdout
  108. Returns
  109. -------
  110. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  111. Each value is a list of folders
  112. """
  113. data_folder = extracted_data_path
  114. data_folder_legacy = legacy_data_path
  115. # obtain country code
  116. country_code = get_country_code(country_name)
  117. if print_ds:
  118. print(f"Country name {country_name} maps to ISO code {country_code}")
  119. rep_data = {}
  120. # data
  121. if print_ds:
  122. print(f"#" * 80)
  123. print(f"The following datasets are available for {country_name}")
  124. for item in data_folder.iterdir():
  125. if item.is_dir():
  126. cleaned_datasets_current_folder = {}
  127. if print_ds:
  128. print("-" * 80)
  129. print(f"Data folder {item.name}")
  130. print("-" * 80)
  131. with open(item / "folder_mapping.json", "r") as mapping_file:
  132. folder_mapping = json.load(mapping_file)
  133. if country_code not in folder_mapping:
  134. if print_ds:
  135. print("No data available")
  136. print("")
  137. else:
  138. country_folder = folder_mapping[country_code]
  139. if not isinstance(country_folder, str):
  140. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  141. datasets_current_folder = {}
  142. current_folder = item / country_folder
  143. for data_file in current_folder.iterdir():
  144. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  145. if data_file.stem in datasets_current_folder:
  146. datasets_current_folder[data_file.stem].append(data_file.suffix)
  147. else:
  148. datasets_current_folder[data_file.stem] = [data_file.suffix]
  149. for dataset in datasets_current_folder:
  150. # process filename to get submission
  151. parts = dataset.split('_')
  152. if parts[0] != country_code:
  153. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  154. else:
  155. terminology = "_".join(parts[3 : ])
  156. key = f"{parts[1]} ({parts[2]}, {terminology})"
  157. data_info = ""
  158. if '.nc' in datasets_current_folder[dataset]:
  159. data_info = data_info + "NF (.nc), "
  160. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  161. data_info = data_info + "IF (.yaml + .csv), "
  162. elif '.csv' in datasets_current_folder[dataset]:
  163. data_info = data_info + "incomplete IF? (.csv), "
  164. elif '.yaml' in datasets_current_folder[dataset]:
  165. data_info = data_info + "incomplete IF (.yaml), "
  166. code_file = get_code_file(country_code, parts[1])
  167. if code_file:
  168. data_info = data_info + f"code: {code_file.name}"
  169. else:
  170. data_info = data_info + f"code: not found"
  171. cleaned_datasets_current_folder[key] = data_info
  172. if print_ds:
  173. if cleaned_datasets_current_folder:
  174. for country_ds in cleaned_datasets_current_folder:
  175. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  176. else:
  177. print("No data available")
  178. print("")
  179. rep_data[item.name] = cleaned_datasets_current_folder
  180. # legacy data
  181. if print_ds:
  182. print(f"#" * 80)
  183. print(f"The following legacy datasets are available for {country_name}")
  184. legacy_data = {}
  185. for item in data_folder_legacy.iterdir():
  186. if item.is_dir():
  187. cleaned_datasets_current_folder = {}
  188. if print_ds:
  189. print("-" * 80)
  190. print(f"Data folder {item.name}")
  191. print("-" * 80)
  192. with open(item / "folder_mapping.json", "r") as mapping_file:
  193. folder_mapping = json.load(mapping_file)
  194. if country_code not in folder_mapping:
  195. if print_ds:
  196. print("No data available")
  197. print("")
  198. else:
  199. country_folder = folder_mapping[country_code]
  200. if not isinstance(country_folder, str):
  201. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  202. datasets_current_folder = {}
  203. current_folder = item / country_folder
  204. for data_file in current_folder.iterdir():
  205. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  206. if data_file.stem in datasets_current_folder:
  207. datasets_current_folder[data_file.stem].append(data_file.suffix)
  208. else:
  209. datasets_current_folder[data_file.stem] = [data_file.suffix]
  210. for dataset in datasets_current_folder:
  211. # process filename to get submission
  212. parts = dataset.split('_')
  213. if parts[0] != country_code:
  214. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  215. else:
  216. terminology = "_".join(parts[3 : ])
  217. key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
  218. data_info = ""
  219. if '.nc' in datasets_current_folder[dataset]:
  220. data_info = data_info + "NF (.nc), "
  221. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  222. data_info = data_info + "IF (.yaml + .csv), "
  223. elif '.csv' in datasets_current_folder[dataset]:
  224. data_info = data_info + "incomplete IF? (.csv), "
  225. elif '.yaml' in datasets_current_folder[dataset]:
  226. data_info = data_info + "incomplete IF (.yaml), "
  227. cleaned_datasets_current_folder[key] = data_info
  228. if print_ds:
  229. if cleaned_datasets_current_folder:
  230. for country_ds in cleaned_datasets_current_folder:
  231. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  232. else:
  233. print("No data available")
  234. print("")
  235. legacy_data[item.name] = cleaned_datasets_current_folder
  236. all_data = {
  237. "rep_data": rep_data,
  238. "legacy_data": legacy_data,
  239. }
  240. return all_data
  241. def get_country_code(
  242. country_name: str,
  243. )->str:
  244. """
  245. obtain country code. If the input is a code it will be returned, if the input
  246. is not a three letter code a search will be performed
  247. Parameters
  248. __________
  249. country_name: str
  250. Country code or name to get the three-letter code for.
  251. """
  252. # First check if it's in the list of custom codes
  253. if country_name in custom_country_mapping:
  254. country_code = country_name
  255. else:
  256. try:
  257. # check if it's a 3 letter code
  258. country = pycountry.countries.get(alpha_3=country_name)
  259. country_code = country.alpha_3
  260. except:
  261. try:
  262. country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
  263. except:
  264. raise ValueError(f"Country name {country_name} can not be mapped to "
  265. f"any country code. Try using the ISO3 code directly.")
  266. if len(country) > 1:
  267. country_code = None
  268. for current_country in country:
  269. if current_country.name == country_name:
  270. country_code = current_country.alpha_3
  271. if country_code is None:
  272. raise ValueError(f"Country name {country_name} has {len(country)} "
  273. f"possible results for country codes.")
  274. country_code = country[0].alpha_3
  275. return country_code
  276. def get_possible_inputs(
  277. country_name: str,
  278. submission: str,
  279. print_info: bool = False,
  280. ) -> List[Path]:
  281. """
  282. For given country name and submission find the possible input files
  283. Parameters
  284. ----------
  285. country_name: str
  286. String containing the country name or ISO 3 letter code
  287. submission: str
  288. String of the submission
  289. print_info: bool = False
  290. If True print information on code found
  291. Returns
  292. -------
  293. returns a list pathlib Path objects for the input files
  294. """
  295. data_folder = downloaded_data_path
  296. # obtain country code
  297. country_code = get_country_code(country_name)
  298. if print_info:
  299. print(f"Country name {country_name} maps to ISO code {country_code}")
  300. input_files = []
  301. for item in data_folder.iterdir():
  302. if item.is_dir():
  303. with open(item / "folder_mapping.json", "r") as mapping_file:
  304. folder_mapping = json.load(mapping_file)
  305. if country_code in folder_mapping:
  306. country_folders = folder_mapping[country_code]
  307. if isinstance(country_folders, str):
  308. # only one folder
  309. country_folders = [country_folders]
  310. for country_folder in country_folders:
  311. input_folder = item / country_folder / submission
  312. if input_folder.exists():
  313. for filepath in input_folder.glob("*"):
  314. input_files.append(filepath.relative_to(root_path))
  315. if print_info:
  316. if input_files:
  317. print(f"Found possible input files:")
  318. for file in input_files:
  319. print(file)
  320. else:
  321. print(f"No input files found")
  322. return input_files
  323. def get_possible_outputs(
  324. country_name: str,
  325. submission: str,
  326. print_info: bool = False,
  327. )-> List[Path]:
  328. """
  329. For given country name and submission find the possible output files
  330. Parameters
  331. ----------
  332. country_name: str
  333. String containing the country name or ISO 3 letter code
  334. submission: str
  335. String of the submission
  336. print_info: bool = False
  337. If True print information on outputs found
  338. Returns
  339. -------
  340. returns a list pathlib Path objects for the input files
  341. """
  342. data_folder = extracted_data_path
  343. # obtain country code
  344. country_code = get_country_code(country_name)
  345. if print_info:
  346. print(f"Country name {country_name} maps to ISO code {country_code}")
  347. output_files = []
  348. for item in data_folder.iterdir():
  349. if item.is_dir():
  350. with open(item / "folder_mapping.json", "r") as mapping_file:
  351. folder_mapping = json.load(mapping_file)
  352. if country_code in folder_mapping:
  353. country_folder = folder_mapping[country_code]
  354. if not isinstance(country_folder, str):
  355. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  356. output_folder = item / country_folder
  357. if output_folder.exists():
  358. for filepath in output_folder.glob(country_code + "_" + submission + "*"):
  359. output_files.append(filepath.relative_to(root_path))
  360. if print_info:
  361. if output_files:
  362. print(f"Found possible output files:")
  363. for file in output_files:
  364. print(file)
  365. else:
  366. print(f"No output files found")
  367. return output_files
  368. def get_code_file(
  369. country_name: str,
  370. submission: str,
  371. print_info: bool = False,
  372. ) -> Path:
  373. """
  374. For given country name and submission find the script that creates the data
  375. Parameters
  376. ----------
  377. country_name: str
  378. String containing the country name or ISO 3 letter code
  379. submission: str
  380. String of the submission
  381. print_info: bool = False
  382. If True print information on code found
  383. Returns
  384. -------
  385. returns a pathlib Path object for the code file
  386. """
  387. code_file_path = None
  388. # CRF is an exception as it's read using the UNFCCC_CRF_reader module
  389. # so we return the path to that.
  390. if submission[0:3] == "CRF":
  391. return root_path / "UNFCCC_CRF_reader"
  392. # obtain country code
  393. country_code = get_country_code(country_name)
  394. if print_info:
  395. print(f"Country name {country_name} maps to ISO code {country_code}")
  396. with open(code_path / "folder_mapping.json", "r") as mapping_file:
  397. folder_mapping = json.load(mapping_file)
  398. if country_code not in folder_mapping:
  399. if print_info:
  400. print("No code available")
  401. print("")
  402. else:
  403. country_folder = code_path / folder_mapping[country_code]
  404. code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
  405. for file in country_folder.iterdir():
  406. if file.match(code_file_name_candidate):
  407. if code_file_path is not None:
  408. raise ValueError(f"Found multiple code candidates: "
  409. f"{code_file_path} and file.name. "
  410. f"Please use only one file with name "
  411. f"'read_ISO3_submission_XXX.YYY'.")
  412. else:
  413. if print_info:
  414. print(f"Found code file {file.relative_to(root_path)}")
  415. code_file_path = file
  416. if code_file_path is not None:
  417. return code_file_path.relative_to(root_path)
  418. else:
  419. return None
  420. def create_folder_mapping(
  421. folder: str,
  422. extracted: bool = False
  423. ) -> None:
  424. """
  425. Create a mapping from 3 letter ISO country codes to folders
  426. based on the subfolders of the given folder. The mapping is
  427. stored in 'folder_mapping.json' in the given folder. Folder
  428. must be given relative to the repository root
  429. Parameters
  430. ----------
  431. folder: str
  432. folder to create the mapping for
  433. extracted: bool = False
  434. If true treat the folder as extracted data, where we
  435. only have one folder per country and no typos in the
  436. names
  437. Returns
  438. -------
  439. Nothing
  440. """
  441. folder = root_path / folder
  442. folder_mapping = {}
  443. #if not extracted:
  444. known_folders = custom_folders
  445. #else:
  446. # known_folders = {}
  447. for item in folder.iterdir():
  448. if item.is_dir() and not item.match("__pycache__"):
  449. if item.name in known_folders:
  450. ISO3 = known_folders[item.name]
  451. else:
  452. try:
  453. country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
  454. if len(country) > 1:
  455. ISO3 = None
  456. for current_country in country:
  457. if current_country.name == item.name.replace("_", " "):
  458. ISO3 = current_country.alpha_3
  459. else:
  460. ISO3 = country[0].alpha_3
  461. except:
  462. ISO3 = None
  463. if ISO3 is None:
  464. print(f"No match for {item.name}")
  465. else:
  466. if ISO3 in folder_mapping.keys():
  467. folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
  468. else:
  469. folder_mapping[ISO3] = item.name
  470. with open(folder / "folder_mapping.json", "w") as mapping_file:
  471. json.dump(folder_mapping, mapping_file, indent=4)