get_submissions_info.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. # helper functions to get information on available submissions
  2. # and data reading functions for a given country
import json
from pathlib import Path
from typing import Dict, List, Optional, Union

import countrynames
#import os
  8. def get_country_submissions(
  9. country_name: str,
  10. print_sub: bool = True,
  11. ) -> Dict[str, List[str]]:
  12. """
  13. Input is a three letter ISO code for a country, or the countries name.
  14. The function tries to map the country name to an ISO code and then
  15. queries the folder mapping files for folders.
  16. Parameters
  17. ----------
  18. country_name: str
  19. String containing the country name or ISO 3 letter code
  20. print_sub: bool
  21. If True information on submissions will be written to stdout
  22. Returns
  23. -------
  24. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  25. Each value is a list of folders
  26. """
  27. codepath = Path(__file__).parent
  28. data_folder = codepath / ".." / ".." / "downloaded_data"
  29. # obtain country code
  30. country_code = countrynames.to_code_3(country_name)
  31. if country_code is None:
  32. raise ValueError(f"Country name {country_name} can not be mapped to "
  33. f"any country code")
  34. if print_sub:
  35. print(f"Country name {country_name} maps to ISO code {country_code}")
  36. country_submissions = {}
  37. for item in data_folder.iterdir():
  38. if item.is_dir():
  39. if print_sub:
  40. print("")
  41. print("#" * 80)
  42. print(f"Data folder {item.name}")
  43. with open(item / "folder_mapping.json", "r") as mapping_file:
  44. folder_mapping = json.load(mapping_file)
  45. country_folders = folder_mapping[country_code]
  46. if isinstance(country_folders, str):
  47. # only one folder
  48. country_folders = [country_folders]
  49. submission_folders = []
  50. for country_folder in country_folders:
  51. current_folder = item / country_folder
  52. if print_sub:
  53. print("-" * 80)
  54. print(f"Submissions in folder {country_folder}:")
  55. for submission_folder in current_folder.iterdir():
  56. if submission_folder.is_dir():
  57. if print_sub:
  58. print(submission_folder.name)
  59. submission_folders.append(submission_folder.name)
  60. country_submissions[item.name] = submission_folders
  61. return country_submissions
  62. def get_country_datasets(
  63. country_name: str,
  64. print_ds: bool = True,
  65. ) -> Dict[str, List[str]]:
  66. """
  67. Input is a three letter ISO code for a country, or the country's name.
  68. The function tries to map the country name to an ISO code and then
  69. checks the code and data folders for content on the country.
  70. Parameters
  71. ----------
  72. country_name: str
  73. String containing the country name or ISO 3 letter code
  74. print_ds: bool
  75. If True information on submissions will be written to stdout
  76. Returns
  77. -------
  78. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  79. Each value is a list of folders
  80. """
  81. codepath = Path(__file__).parent
  82. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  83. rootpath = codepath / ".." / ".."
  84. rootpath = rootpath.resolve()
  85. data_folder = rootpath / "extracted_data"
  86. data_folder_legacy = rootpath / "legacy_data"
  87. # obtain country code
  88. country_code = countrynames.to_code_3(country_name)
  89. if country_code is None:
  90. raise ValueError(f"Country name {country_name} can not be mapped to "
  91. f"any country code")
  92. if print_ds:
  93. print(f"Country name {country_name} maps to ISO code {country_code}")
  94. rep_data = {}
  95. # data
  96. if print_ds:
  97. print(f"#" * 80)
  98. print(f"The following datasets are available for {country_name}")
  99. for item in data_folder.iterdir():
  100. if item.is_dir():
  101. cleaned_datasets_current_folder = {}
  102. if print_ds:
  103. print("-" * 80)
  104. print(f"Data folder {item.name}")
  105. print("-" * 80)
  106. with open(item / "folder_mapping.json", "r") as mapping_file:
  107. folder_mapping = json.load(mapping_file)
  108. if country_code not in folder_mapping:
  109. if print_ds:
  110. print("No data available")
  111. print("")
  112. else:
  113. country_folder = folder_mapping[country_code]
  114. if not isinstance(country_folder, str):
  115. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  116. datasets_current_folder = {}
  117. current_folder = item / country_folder
  118. for data_file in current_folder.iterdir():
  119. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  120. if data_file.stem in datasets_current_folder:
  121. datasets_current_folder[data_file.stem].append(data_file.suffix)
  122. else:
  123. datasets_current_folder[data_file.stem] = [data_file.suffix]
  124. for dataset in datasets_current_folder:
  125. # process filename to get submission
  126. parts = dataset.split('_')
  127. if parts[0] != country_code:
  128. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  129. else:
  130. terminology = "_".join(parts[3 : ])
  131. key = f"{parts[1]} ({parts[2]}, {terminology})"
  132. data_info = ""
  133. if '.nc' in datasets_current_folder[dataset]:
  134. data_info = data_info + "NF (.nc), "
  135. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  136. data_info = data_info + "IF (.yaml + .csv), "
  137. elif '.csv' in datasets_current_folder[dataset]:
  138. data_info = data_info + "incomplete IF? (.csv), "
  139. elif '.yaml' in datasets_current_folder[dataset]:
  140. data_info = data_info + "incomplete IF (.yaml), "
  141. code_file = get_code_file(country_code, parts[1])
  142. if code_file:
  143. data_info = data_info + f"code: {code_file.name}"
  144. else:
  145. data_info = data_info + f"code: not found"
  146. cleaned_datasets_current_folder[key] = data_info
  147. if print_ds:
  148. if cleaned_datasets_current_folder:
  149. for country_ds in cleaned_datasets_current_folder:
  150. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  151. else:
  152. print("No data available")
  153. print("")
  154. rep_data[item.name] = cleaned_datasets_current_folder
  155. # legacy data
  156. if print_ds:
  157. print(f"#" * 80)
  158. print(f"The following legacy datasets are available for {country_name}")
  159. legacy_data = {}
  160. for item in data_folder_legacy.iterdir():
  161. if item.is_dir():
  162. cleaned_datasets_current_folder = {}
  163. if print_ds:
  164. print("-" * 80)
  165. print(f"Data folder {item.name}")
  166. print("-" * 80)
  167. with open(item / "folder_mapping.json", "r") as mapping_file:
  168. folder_mapping = json.load(mapping_file)
  169. if country_code not in folder_mapping:
  170. if print_ds:
  171. print("No data available")
  172. print("")
  173. else:
  174. country_folder = folder_mapping[country_code]
  175. if not isinstance(country_folder, str):
  176. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  177. datasets_current_folder = {}
  178. current_folder = item / country_folder
  179. for data_file in current_folder.iterdir():
  180. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  181. if data_file.stem in datasets_current_folder:
  182. datasets_current_folder[data_file.stem].append(data_file.suffix)
  183. else:
  184. datasets_current_folder[data_file.stem] = [data_file.suffix]
  185. for dataset in datasets_current_folder:
  186. # process filename to get submission
  187. parts = dataset.split('_')
  188. if parts[0] != country_code:
  189. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  190. else:
  191. terminology = "_".join(parts[3 : ])
  192. key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
  193. data_info = ""
  194. if '.nc' in datasets_current_folder[dataset]:
  195. data_info = data_info + "NF (.nc), "
  196. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  197. data_info = data_info + "IF (.yaml + .csv), "
  198. elif '.csv' in datasets_current_folder[dataset]:
  199. data_info = data_info + "incomplete IF? (.csv), "
  200. elif '.yaml' in datasets_current_folder[dataset]:
  201. data_info = data_info + "incomplete IF (.yaml), "
  202. cleaned_datasets_current_folder[key] = data_info
  203. if print_ds:
  204. if cleaned_datasets_current_folder:
  205. for country_ds in cleaned_datasets_current_folder:
  206. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  207. else:
  208. print("No data available")
  209. print("")
  210. legacy_data[item.name] = cleaned_datasets_current_folder
  211. all_data = {
  212. "rep_data": rep_data,
  213. "legacy_data": legacy_data,
  214. }
  215. return all_data
  216. def get_possible_inputs(
  217. country_name: str,
  218. submission: str,
  219. print_info: bool = False,
  220. ) -> List[Path]:
  221. """
  222. For given country name and submission find the possible input files
  223. Parameters
  224. ----------
  225. country_name: str
  226. String containing the country name or ISO 3 letter code
  227. submission: str
  228. String of the submission
  229. print_info: bool = False
  230. If True print information on code found
  231. Returns
  232. -------
  233. returns a list pathlib Path objects for the input files
  234. """
  235. codepath = Path(__file__).parent
  236. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  237. rootpath = codepath / ".." / ".."
  238. rootpath = rootpath.resolve()
  239. data_folder = rootpath / "downloaded_data"
  240. # obtain country code
  241. country_code = countrynames.to_code_3(country_name)
  242. if country_code is None:
  243. raise ValueError(f"Country name {country_name} can not be mapped to "
  244. f"any country code")
  245. if print_info:
  246. print(f"Country name {country_name} maps to ISO code {country_code}")
  247. input_files = []
  248. for item in data_folder.iterdir():
  249. if item.is_dir():
  250. with open(item / "folder_mapping.json", "r") as mapping_file:
  251. folder_mapping = json.load(mapping_file)
  252. if country_code in folder_mapping:
  253. country_folders = folder_mapping[country_code]
  254. if isinstance(country_folders, str):
  255. # only one folder
  256. country_folders = [country_folders]
  257. for country_folder in country_folders:
  258. input_folder = item / country_folder / submission
  259. if input_folder.exists():
  260. for filepath in input_folder.glob("*"):
  261. input_files.append(filepath.relative_to(rootpath))
  262. if print_info:
  263. if input_files:
  264. print(f"Found possible input files:")
  265. for file in input_files:
  266. print(file)
  267. else:
  268. print(f"No input files found")
  269. return input_files
  270. def get_possible_outputs(
  271. country_name: str,
  272. submission: str,
  273. print_info: bool = False,
  274. )-> List[Path]:
  275. """
  276. For given country name and submission find the possible output files
  277. Parameters
  278. ----------
  279. country_name: str
  280. String containing the country name or ISO 3 letter code
  281. submission: str
  282. String of the submission
  283. print_info: bool = False
  284. If True print information on outputs found
  285. Returns
  286. -------
  287. returns a list pathlib Path objects for the input files
  288. """
  289. codepath = Path(__file__).parent
  290. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  291. rootpath = codepath / ".." / ".."
  292. rootpath = rootpath.resolve()
  293. data_folder = rootpath / "extracted_data"
  294. # obtain country code
  295. country_code = countrynames.to_code_3(country_name)
  296. if country_code is None:
  297. raise ValueError(f"Country name {country_name} can not be mapped to "
  298. f"any country code")
  299. if print_info:
  300. print(f"Country name {country_name} maps to ISO code {country_code}")
  301. output_files = []
  302. for item in data_folder.iterdir():
  303. if item.is_dir():
  304. with open(item / "folder_mapping.json", "r") as mapping_file:
  305. folder_mapping = json.load(mapping_file)
  306. if country_code in folder_mapping:
  307. country_folder = folder_mapping[country_code]
  308. if not isinstance(country_folder, str):
  309. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  310. output_folder = item / country_folder
  311. if output_folder.exists():
  312. for filepath in output_folder.glob(country_code + "_" + submission + "*"):
  313. output_files.append(filepath.relative_to(rootpath))
  314. if print_info:
  315. if output_files:
  316. print(f"Found possible output files:")
  317. for file in output_files:
  318. print(file)
  319. else:
  320. print(f"No output files found")
  321. return output_files
  322. def get_code_file(
  323. country_name: str,
  324. submission: str,
  325. print_info: bool = False,
  326. ) -> Path:
  327. """
  328. For given country name and submission find the script that creates the data
  329. Parameters
  330. ----------
  331. country_name: str
  332. String containing the country name or ISO 3 letter code
  333. submission: str
  334. String of the submission
  335. print_info: bool = False
  336. If True print information on code found
  337. Returns
  338. -------
  339. returns a pathlib Path object for the code file
  340. """
  341. codepath = Path(__file__).parent
  342. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  343. rootpath = codepath / ".." / ".."
  344. rootpath = rootpath.resolve()
  345. code_file_path = None
  346. # obtain country code
  347. country_code = countrynames.to_code_3(country_name)
  348. if country_code is None:
  349. raise ValueError(f"Country name {country_name} can not be mapped to "
  350. f"any country code")
  351. if print_info:
  352. print(f"Country name {country_name} maps to ISO code {country_code}")
  353. with open(codepath / "folder_mapping.json", "r") as mapping_file:
  354. folder_mapping = json.load(mapping_file)
  355. if country_code not in folder_mapping:
  356. if print_info:
  357. print("No code available")
  358. print("")
  359. else:
  360. country_folder = codepath / folder_mapping[country_code]
  361. code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
  362. for file in country_folder.iterdir():
  363. if file.match(code_file_name_candidate):
  364. if code_file_path is not None:
  365. raise ValueError(f"Found multiple code candidates: "
  366. f"{code_file_path} and file.name. "
  367. f"Please use only one file with name "
  368. f"'read_ISO3_submission_XXX.YYY'.")
  369. else:
  370. if print_info:
  371. print(f"Found code file {file.relative_to(rootpath)}")
  372. code_file_path = file
  373. return code_file_path.relative_to(rootpath)
  374. def create_folder_mapping(
  375. folder: str,
  376. extracted: bool = False
  377. ) -> None:
  378. """
  379. Create a mapping from 3 letter ISO country codes to folders
  380. based on the subfolders of the given folder. The mapping is
  381. stored in 'folder_mapping.json' in the given folder.
  382. Parameters
  383. ----------
  384. folder: str
  385. folder to create the mapping for
  386. extracted: bool = False
  387. If true treat the folder as extracted data, where we
  388. only have one folder per country and no typos in the
  389. names
  390. Returns
  391. -------
  392. Nothing
  393. """
  394. if extracted:
  395. folder_mapping = {}
  396. else:
  397. folder_mapping = {
  398. 'VEN': 'Venezeula_(Bolivarian_Republic_of)',
  399. 'FSM': 'Micronesia_(Federated_State_of)',
  400. 'MKD': 'The_Republic_of_North_Macedonia',
  401. }
  402. known_folders = list(folder_mapping.values())
  403. for item in folder.iterdir():
  404. if item.is_dir():
  405. ISO3 = countrynames.to_code_3(item.name)
  406. if ISO3 is None:
  407. if item.name not in known_folders:
  408. print(folder_mapping.values())
  409. print(f"No match for {item.name}")
  410. else:
  411. known_folders.append(item.name)
  412. if ISO3 in folder_mapping.keys():
  413. folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
  414. else:
  415. folder_mapping[ISO3] = item.name
  416. with open(folder / "folder_mapping.json", "w") as mapping_file:
  417. json.dump(folder_mapping, mapping_file, indent=4)