get_submissions_info.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523
  1. # helper functions to get information on available submissions
  2. # and data reading functions for a given country
  3. from typing import List, Dict
  4. from pathlib import Path
  5. import json
  6. import countrynames
  7. #import os
  8. def get_country_submissions(
  9. country_name: str,
  10. print_sub: bool = True,
  11. ) -> Dict[str, List[str]]:
  12. """
  13. Input is a three letter ISO code for a country, or the countries name.
  14. The function tries to map the country name to an ISO code and then
  15. queries the folder mapping files for folders.
  16. Parameters
  17. ----------
  18. country_name: str
  19. String containing the country name or ISO 3 letter code
  20. print_sub: bool
  21. If True information on submissions will be written to stdout
  22. Returns
  23. -------
  24. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  25. Each value is a list of folders
  26. """
  27. codepath = Path(__file__).parent
  28. data_folder = codepath / ".." / ".." / "downloaded_data"
  29. # obtain country code
  30. country_code = countrynames.to_code_3(country_name)
  31. if country_code is None:
  32. raise ValueError(f"Country name {country_name} can not be mapped to "
  33. f"any country code")
  34. if print_sub:
  35. print(f"Country name {country_name} maps to ISO code {country_code}")
  36. country_submissions = {}
  37. if print_sub:
  38. print(f"#" * 80)
  39. print(f"The following submissions are available for {country_name}")
  40. for item in data_folder.iterdir():
  41. if item.is_dir():
  42. if print_sub:
  43. print("")
  44. print("-" * 80)
  45. print(f"Data folder {item.name}")
  46. print("-" * 80)
  47. with open(item / "folder_mapping.json", "r") as mapping_file:
  48. folder_mapping = json.load(mapping_file)
  49. if country_code in folder_mapping:
  50. country_folders = folder_mapping[country_code]
  51. if isinstance(country_folders, str):
  52. # only one folder
  53. country_folders = [country_folders]
  54. submission_folders = []
  55. for country_folder in country_folders:
  56. current_folder = item / country_folder
  57. if print_sub:
  58. print(f"Submissions in folder {country_folder}:")
  59. for submission_folder in current_folder.iterdir():
  60. if submission_folder.is_dir():
  61. if print_sub:
  62. print(submission_folder.name)
  63. submission_folders.append(submission_folder.name)
  64. country_submissions[item.name] = submission_folders
  65. else:
  66. print(f"No submissions available for {country_name}.")
  67. return country_submissions
  68. def get_country_datasets(
  69. country_name: str,
  70. print_ds: bool = True,
  71. ) -> Dict[str, List[str]]:
  72. """
  73. Input is a three letter ISO code for a country, or the country's name.
  74. The function tries to map the country name to an ISO code and then
  75. checks the code and data folders for content on the country.
  76. Parameters
  77. ----------
  78. country_name: str
  79. String containing the country name or ISO 3 letter code
  80. print_ds: bool
  81. If True information on submissions will be written to stdout
  82. Returns
  83. -------
  84. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  85. Each value is a list of folders
  86. """
  87. codepath = Path(__file__).parent
  88. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  89. rootpath = codepath / ".." / ".."
  90. rootpath = rootpath.resolve()
  91. data_folder = rootpath / "extracted_data"
  92. data_folder_legacy = rootpath / "legacy_data"
  93. # obtain country code
  94. country_code = countrynames.to_code_3(country_name)
  95. if country_code is None:
  96. raise ValueError(f"Country name {country_name} can not be mapped to "
  97. f"any country code")
  98. if print_ds:
  99. print(f"Country name {country_name} maps to ISO code {country_code}")
  100. rep_data = {}
  101. # data
  102. if print_ds:
  103. print(f"#" * 80)
  104. print(f"The following datasets are available for {country_name}")
  105. for item in data_folder.iterdir():
  106. if item.is_dir():
  107. cleaned_datasets_current_folder = {}
  108. if print_ds:
  109. print("-" * 80)
  110. print(f"Data folder {item.name}")
  111. print("-" * 80)
  112. with open(item / "folder_mapping.json", "r") as mapping_file:
  113. folder_mapping = json.load(mapping_file)
  114. if country_code not in folder_mapping:
  115. if print_ds:
  116. print("No data available")
  117. print("")
  118. else:
  119. country_folder = folder_mapping[country_code]
  120. if not isinstance(country_folder, str):
  121. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  122. datasets_current_folder = {}
  123. current_folder = item / country_folder
  124. for data_file in current_folder.iterdir():
  125. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  126. if data_file.stem in datasets_current_folder:
  127. datasets_current_folder[data_file.stem].append(data_file.suffix)
  128. else:
  129. datasets_current_folder[data_file.stem] = [data_file.suffix]
  130. for dataset in datasets_current_folder:
  131. # process filename to get submission
  132. parts = dataset.split('_')
  133. if parts[0] != country_code:
  134. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  135. else:
  136. terminology = "_".join(parts[3 : ])
  137. key = f"{parts[1]} ({parts[2]}, {terminology})"
  138. data_info = ""
  139. if '.nc' in datasets_current_folder[dataset]:
  140. data_info = data_info + "NF (.nc), "
  141. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  142. data_info = data_info + "IF (.yaml + .csv), "
  143. elif '.csv' in datasets_current_folder[dataset]:
  144. data_info = data_info + "incomplete IF? (.csv), "
  145. elif '.yaml' in datasets_current_folder[dataset]:
  146. data_info = data_info + "incomplete IF (.yaml), "
  147. code_file = get_code_file(country_code, parts[1])
  148. if code_file:
  149. data_info = data_info + f"code: {code_file.name}"
  150. else:
  151. data_info = data_info + f"code: not found"
  152. cleaned_datasets_current_folder[key] = data_info
  153. if print_ds:
  154. if cleaned_datasets_current_folder:
  155. for country_ds in cleaned_datasets_current_folder:
  156. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  157. else:
  158. print("No data available")
  159. print("")
  160. rep_data[item.name] = cleaned_datasets_current_folder
  161. # legacy data
  162. if print_ds:
  163. print(f"#" * 80)
  164. print(f"The following legacy datasets are available for {country_name}")
  165. legacy_data = {}
  166. for item in data_folder_legacy.iterdir():
  167. if item.is_dir():
  168. cleaned_datasets_current_folder = {}
  169. if print_ds:
  170. print("-" * 80)
  171. print(f"Data folder {item.name}")
  172. print("-" * 80)
  173. with open(item / "folder_mapping.json", "r") as mapping_file:
  174. folder_mapping = json.load(mapping_file)
  175. if country_code not in folder_mapping:
  176. if print_ds:
  177. print("No data available")
  178. print("")
  179. else:
  180. country_folder = folder_mapping[country_code]
  181. if not isinstance(country_folder, str):
  182. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  183. datasets_current_folder = {}
  184. current_folder = item / country_folder
  185. for data_file in current_folder.iterdir():
  186. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  187. if data_file.stem in datasets_current_folder:
  188. datasets_current_folder[data_file.stem].append(data_file.suffix)
  189. else:
  190. datasets_current_folder[data_file.stem] = [data_file.suffix]
  191. for dataset in datasets_current_folder:
  192. # process filename to get submission
  193. parts = dataset.split('_')
  194. if parts[0] != country_code:
  195. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  196. else:
  197. terminology = "_".join(parts[3 : ])
  198. key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
  199. data_info = ""
  200. if '.nc' in datasets_current_folder[dataset]:
  201. data_info = data_info + "NF (.nc), "
  202. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  203. data_info = data_info + "IF (.yaml + .csv), "
  204. elif '.csv' in datasets_current_folder[dataset]:
  205. data_info = data_info + "incomplete IF? (.csv), "
  206. elif '.yaml' in datasets_current_folder[dataset]:
  207. data_info = data_info + "incomplete IF (.yaml), "
  208. cleaned_datasets_current_folder[key] = data_info
  209. if print_ds:
  210. if cleaned_datasets_current_folder:
  211. for country_ds in cleaned_datasets_current_folder:
  212. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  213. else:
  214. print("No data available")
  215. print("")
  216. legacy_data[item.name] = cleaned_datasets_current_folder
  217. all_data = {
  218. "rep_data": rep_data,
  219. "legacy_data": legacy_data,
  220. }
  221. return all_data
  222. def get_possible_inputs(
  223. country_name: str,
  224. submission: str,
  225. print_info: bool = False,
  226. ) -> List[Path]:
  227. """
  228. For given country name and submission find the possible input files
  229. Parameters
  230. ----------
  231. country_name: str
  232. String containing the country name or ISO 3 letter code
  233. submission: str
  234. String of the submission
  235. print_info: bool = False
  236. If True print information on code found
  237. Returns
  238. -------
  239. returns a list pathlib Path objects for the input files
  240. """
  241. codepath = Path(__file__).parent
  242. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  243. rootpath = codepath / ".." / ".."
  244. rootpath = rootpath.resolve()
  245. data_folder = rootpath / "downloaded_data"
  246. # obtain country code
  247. country_code = countrynames.to_code_3(country_name)
  248. if country_code is None:
  249. raise ValueError(f"Country name {country_name} can not be mapped to "
  250. f"any country code")
  251. if print_info:
  252. print(f"Country name {country_name} maps to ISO code {country_code}")
  253. input_files = []
  254. for item in data_folder.iterdir():
  255. if item.is_dir():
  256. with open(item / "folder_mapping.json", "r") as mapping_file:
  257. folder_mapping = json.load(mapping_file)
  258. if country_code in folder_mapping:
  259. country_folders = folder_mapping[country_code]
  260. if isinstance(country_folders, str):
  261. # only one folder
  262. country_folders = [country_folders]
  263. for country_folder in country_folders:
  264. input_folder = item / country_folder / submission
  265. if input_folder.exists():
  266. for filepath in input_folder.glob("*"):
  267. input_files.append(filepath.relative_to(rootpath))
  268. if print_info:
  269. if input_files:
  270. print(f"Found possible input files:")
  271. for file in input_files:
  272. print(file)
  273. else:
  274. print(f"No input files found")
  275. return input_files
  276. def get_possible_outputs(
  277. country_name: str,
  278. submission: str,
  279. print_info: bool = False,
  280. )-> List[Path]:
  281. """
  282. For given country name and submission find the possible output files
  283. Parameters
  284. ----------
  285. country_name: str
  286. String containing the country name or ISO 3 letter code
  287. submission: str
  288. String of the submission
  289. print_info: bool = False
  290. If True print information on outputs found
  291. Returns
  292. -------
  293. returns a list pathlib Path objects for the input files
  294. """
  295. codepath = Path(__file__).parent
  296. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  297. rootpath = codepath / ".." / ".."
  298. rootpath = rootpath.resolve()
  299. data_folder = rootpath / "extracted_data"
  300. # obtain country code
  301. country_code = countrynames.to_code_3(country_name)
  302. if country_code is None:
  303. raise ValueError(f"Country name {country_name} can not be mapped to "
  304. f"any country code")
  305. if print_info:
  306. print(f"Country name {country_name} maps to ISO code {country_code}")
  307. output_files = []
  308. for item in data_folder.iterdir():
  309. if item.is_dir():
  310. with open(item / "folder_mapping.json", "r") as mapping_file:
  311. folder_mapping = json.load(mapping_file)
  312. if country_code in folder_mapping:
  313. country_folder = folder_mapping[country_code]
  314. if not isinstance(country_folder, str):
  315. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  316. output_folder = item / country_folder
  317. if output_folder.exists():
  318. for filepath in output_folder.glob(country_code + "_" + submission + "*"):
  319. output_files.append(filepath.relative_to(rootpath))
  320. if print_info:
  321. if output_files:
  322. print(f"Found possible output files:")
  323. for file in output_files:
  324. print(file)
  325. else:
  326. print(f"No output files found")
  327. return output_files
  328. def get_code_file(
  329. country_name: str,
  330. submission: str,
  331. print_info: bool = False,
  332. ) -> Path:
  333. """
  334. For given country name and submission find the script that creates the data
  335. Parameters
  336. ----------
  337. country_name: str
  338. String containing the country name or ISO 3 letter code
  339. submission: str
  340. String of the submission
  341. print_info: bool = False
  342. If True print information on code found
  343. Returns
  344. -------
  345. returns a pathlib Path object for the code file
  346. """
  347. codepath = Path(__file__).parent
  348. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  349. rootpath = codepath / ".." / ".."
  350. rootpath = rootpath.resolve()
  351. code_file_path = None
  352. # obtain country code
  353. country_code = countrynames.to_code_3(country_name)
  354. if country_code is None:
  355. raise ValueError(f"Country name {country_name} can not be mapped to "
  356. f"any country code")
  357. if print_info:
  358. print(f"Country name {country_name} maps to ISO code {country_code}")
  359. with open(codepath / "folder_mapping.json", "r") as mapping_file:
  360. folder_mapping = json.load(mapping_file)
  361. if country_code not in folder_mapping:
  362. if print_info:
  363. print("No code available")
  364. print("")
  365. else:
  366. country_folder = codepath / folder_mapping[country_code]
  367. code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
  368. for file in country_folder.iterdir():
  369. if file.match(code_file_name_candidate):
  370. if code_file_path is not None:
  371. raise ValueError(f"Found multiple code candidates: "
  372. f"{code_file_path} and file.name. "
  373. f"Please use only one file with name "
  374. f"'read_ISO3_submission_XXX.YYY'.")
  375. else:
  376. if print_info:
  377. print(f"Found code file {file.relative_to(rootpath)}")
  378. code_file_path = file
  379. if code_file_path is not None:
  380. return code_file_path.relative_to(rootpath)
  381. else:
  382. return None
  383. def create_folder_mapping(
  384. folder: str,
  385. extracted: bool = False
  386. ) -> None:
  387. """
  388. Create a mapping from 3 letter ISO country codes to folders
  389. based on the subfolders of the given folder. The mapping is
  390. stored in 'folder_mapping.json' in the given folder. Folder
  391. must be given relative to the repository root
  392. Parameters
  393. ----------
  394. folder: str
  395. folder to create the mapping for
  396. extracted: bool = False
  397. If true treat the folder as extracted data, where we
  398. only have one folder per country and no typos in the
  399. names
  400. Returns
  401. -------
  402. Nothing
  403. """
  404. codepath = Path(__file__).parent
  405. rootpath = codepath / ".." / ".."
  406. rootpath = rootpath.resolve()
  407. folder = rootpath / folder
  408. if extracted:
  409. folder_mapping = {}
  410. else:
  411. folder_mapping = {
  412. 'VEN': 'Venezeula_(Bolivarian_Republic_of)',
  413. 'FSM': 'Micronesia_(Federated_State_of)',
  414. 'MKD': 'The_Republic_of_North_Macedonia',
  415. }
  416. known_folders = list(folder_mapping.values())
  417. for item in folder.iterdir():
  418. if item.is_dir():
  419. ISO3 = countrynames.to_code_3(item.name)
  420. if ISO3 is None:
  421. if item.name not in known_folders:
  422. print(folder_mapping.values())
  423. print(f"No match for {item.name}")
  424. else:
  425. known_folders.append(item.name)
  426. if ISO3 in folder_mapping.keys():
  427. folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
  428. else:
  429. folder_mapping[ISO3] = item.name
  430. with open(folder / "folder_mapping.json", "w") as mapping_file:
  431. json.dump(folder_mapping, mapping_file, indent=4)