get_submissions_info.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530
  1. # helper functions to get information on available submissions
  2. # and data reading functions for a given country
import json
#import os
from pathlib import Path
from typing import Dict, List, Optional

import pycountry
  8. def get_country_submissions(
  9. country_name: str,
  10. print_sub: bool = True,
  11. ) -> Dict[str, List[str]]:
  12. """
  13. Input is a three letter ISO code for a country, or the countries name.
  14. The function tries to map the country name to an ISO code and then
  15. queries the folder mapping files for folders.
  16. Parameters
  17. ----------
  18. country_name: str
  19. String containing the country name or ISO 3 letter code
  20. print_sub: bool
  21. If True information on submissions will be written to stdout
  22. Returns
  23. -------
  24. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  25. Each value is a list of folders
  26. """
  27. codepath = Path(__file__).parent
  28. data_folder = codepath / ".." / ".." / "downloaded_data"
  29. # obtain country code
  30. #country_code = countrynames.to_code_3(country_name)
  31. try:
  32. country = pycountry.countries.search_fuzzy(country_name)
  33. except:
  34. raise ValueError(f"Country name {country_name} can not be mapped to "
  35. f"any country code")
  36. if len(country) > 1:
  37. raise ValueError(f"Country name {country_name} has {len(country)} "
  38. f"possible results for country codes.")
  39. country_code = country[0].alpha_3
  40. if print_sub:
  41. print(f"Country name {country_name} maps to ISO code {country_code}")
  42. country_submissions = {}
  43. if print_sub:
  44. print(f"#" * 80)
  45. print(f"The following submissions are available for {country_name}")
  46. for item in data_folder.iterdir():
  47. if item.is_dir():
  48. if print_sub:
  49. print("")
  50. print("-" * 80)
  51. print(f"Data folder {item.name}")
  52. print("-" * 80)
  53. with open(item / "folder_mapping.json", "r") as mapping_file:
  54. folder_mapping = json.load(mapping_file)
  55. if country_code in folder_mapping:
  56. country_folders = folder_mapping[country_code]
  57. if isinstance(country_folders, str):
  58. # only one folder
  59. country_folders = [country_folders]
  60. submission_folders = []
  61. for country_folder in country_folders:
  62. current_folder = item / country_folder
  63. if print_sub:
  64. print(f"Submissions in folder {country_folder}:")
  65. for submission_folder in current_folder.iterdir():
  66. if submission_folder.is_dir():
  67. if print_sub:
  68. print(submission_folder.name)
  69. submission_folders.append(submission_folder.name)
  70. country_submissions[item.name] = submission_folders
  71. else:
  72. print(f"No submissions available for {country_name}.")
  73. return country_submissions
  74. def get_country_datasets(
  75. country_name: str,
  76. print_ds: bool = True,
  77. ) -> Dict[str, List[str]]:
  78. """
  79. Input is a three letter ISO code for a country, or the country's name.
  80. The function tries to map the country name to an ISO code and then
  81. checks the code and data folders for content on the country.
  82. Parameters
  83. ----------
  84. country_name: str
  85. String containing the country name or ISO 3 letter code
  86. print_ds: bool
  87. If True information on submissions will be written to stdout
  88. Returns
  89. -------
  90. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  91. Each value is a list of folders
  92. """
  93. codepath = Path(__file__).parent
  94. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  95. rootpath = codepath / ".." / ".."
  96. rootpath = rootpath.resolve()
  97. data_folder = rootpath / "extracted_data"
  98. data_folder_legacy = rootpath / "legacy_data"
  99. # obtain country code
  100. country_code = countrynames.to_code_3(country_name)
  101. if country_code is None:
  102. raise ValueError(f"Country name {country_name} can not be mapped to "
  103. f"any country code")
  104. if print_ds:
  105. print(f"Country name {country_name} maps to ISO code {country_code}")
  106. rep_data = {}
  107. # data
  108. if print_ds:
  109. print(f"#" * 80)
  110. print(f"The following datasets are available for {country_name}")
  111. for item in data_folder.iterdir():
  112. if item.is_dir():
  113. cleaned_datasets_current_folder = {}
  114. if print_ds:
  115. print("-" * 80)
  116. print(f"Data folder {item.name}")
  117. print("-" * 80)
  118. with open(item / "folder_mapping.json", "r") as mapping_file:
  119. folder_mapping = json.load(mapping_file)
  120. if country_code not in folder_mapping:
  121. if print_ds:
  122. print("No data available")
  123. print("")
  124. else:
  125. country_folder = folder_mapping[country_code]
  126. if not isinstance(country_folder, str):
  127. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  128. datasets_current_folder = {}
  129. current_folder = item / country_folder
  130. for data_file in current_folder.iterdir():
  131. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  132. if data_file.stem in datasets_current_folder:
  133. datasets_current_folder[data_file.stem].append(data_file.suffix)
  134. else:
  135. datasets_current_folder[data_file.stem] = [data_file.suffix]
  136. for dataset in datasets_current_folder:
  137. # process filename to get submission
  138. parts = dataset.split('_')
  139. if parts[0] != country_code:
  140. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  141. else:
  142. terminology = "_".join(parts[3 : ])
  143. key = f"{parts[1]} ({parts[2]}, {terminology})"
  144. data_info = ""
  145. if '.nc' in datasets_current_folder[dataset]:
  146. data_info = data_info + "NF (.nc), "
  147. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  148. data_info = data_info + "IF (.yaml + .csv), "
  149. elif '.csv' in datasets_current_folder[dataset]:
  150. data_info = data_info + "incomplete IF? (.csv), "
  151. elif '.yaml' in datasets_current_folder[dataset]:
  152. data_info = data_info + "incomplete IF (.yaml), "
  153. code_file = get_code_file(country_code, parts[1])
  154. if code_file:
  155. data_info = data_info + f"code: {code_file.name}"
  156. else:
  157. data_info = data_info + f"code: not found"
  158. cleaned_datasets_current_folder[key] = data_info
  159. if print_ds:
  160. if cleaned_datasets_current_folder:
  161. for country_ds in cleaned_datasets_current_folder:
  162. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  163. else:
  164. print("No data available")
  165. print("")
  166. rep_data[item.name] = cleaned_datasets_current_folder
  167. # legacy data
  168. if print_ds:
  169. print(f"#" * 80)
  170. print(f"The following legacy datasets are available for {country_name}")
  171. legacy_data = {}
  172. for item in data_folder_legacy.iterdir():
  173. if item.is_dir():
  174. cleaned_datasets_current_folder = {}
  175. if print_ds:
  176. print("-" * 80)
  177. print(f"Data folder {item.name}")
  178. print("-" * 80)
  179. with open(item / "folder_mapping.json", "r") as mapping_file:
  180. folder_mapping = json.load(mapping_file)
  181. if country_code not in folder_mapping:
  182. if print_ds:
  183. print("No data available")
  184. print("")
  185. else:
  186. country_folder = folder_mapping[country_code]
  187. if not isinstance(country_folder, str):
  188. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  189. datasets_current_folder = {}
  190. current_folder = item / country_folder
  191. for data_file in current_folder.iterdir():
  192. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  193. if data_file.stem in datasets_current_folder:
  194. datasets_current_folder[data_file.stem].append(data_file.suffix)
  195. else:
  196. datasets_current_folder[data_file.stem] = [data_file.suffix]
  197. for dataset in datasets_current_folder:
  198. # process filename to get submission
  199. parts = dataset.split('_')
  200. if parts[0] != country_code:
  201. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  202. else:
  203. terminology = "_".join(parts[3 : ])
  204. key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
  205. data_info = ""
  206. if '.nc' in datasets_current_folder[dataset]:
  207. data_info = data_info + "NF (.nc), "
  208. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  209. data_info = data_info + "IF (.yaml + .csv), "
  210. elif '.csv' in datasets_current_folder[dataset]:
  211. data_info = data_info + "incomplete IF? (.csv), "
  212. elif '.yaml' in datasets_current_folder[dataset]:
  213. data_info = data_info + "incomplete IF (.yaml), "
  214. cleaned_datasets_current_folder[key] = data_info
  215. if print_ds:
  216. if cleaned_datasets_current_folder:
  217. for country_ds in cleaned_datasets_current_folder:
  218. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  219. else:
  220. print("No data available")
  221. print("")
  222. legacy_data[item.name] = cleaned_datasets_current_folder
  223. all_data = {
  224. "rep_data": rep_data,
  225. "legacy_data": legacy_data,
  226. }
  227. return all_data
  228. def get_possible_inputs(
  229. country_name: str,
  230. submission: str,
  231. print_info: bool = False,
  232. ) -> List[Path]:
  233. """
  234. For given country name and submission find the possible input files
  235. Parameters
  236. ----------
  237. country_name: str
  238. String containing the country name or ISO 3 letter code
  239. submission: str
  240. String of the submission
  241. print_info: bool = False
  242. If True print information on code found
  243. Returns
  244. -------
  245. returns a list pathlib Path objects for the input files
  246. """
  247. codepath = Path(__file__).parent
  248. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  249. rootpath = codepath / ".." / ".."
  250. rootpath = rootpath.resolve()
  251. data_folder = rootpath / "downloaded_data"
  252. # obtain country code
  253. country_code = countrynames.to_code_3(country_name)
  254. if country_code is None:
  255. raise ValueError(f"Country name {country_name} can not be mapped to "
  256. f"any country code")
  257. if print_info:
  258. print(f"Country name {country_name} maps to ISO code {country_code}")
  259. input_files = []
  260. for item in data_folder.iterdir():
  261. if item.is_dir():
  262. with open(item / "folder_mapping.json", "r") as mapping_file:
  263. folder_mapping = json.load(mapping_file)
  264. if country_code in folder_mapping:
  265. country_folders = folder_mapping[country_code]
  266. if isinstance(country_folders, str):
  267. # only one folder
  268. country_folders = [country_folders]
  269. for country_folder in country_folders:
  270. input_folder = item / country_folder / submission
  271. if input_folder.exists():
  272. for filepath in input_folder.glob("*"):
  273. input_files.append(filepath.relative_to(rootpath))
  274. if print_info:
  275. if input_files:
  276. print(f"Found possible input files:")
  277. for file in input_files:
  278. print(file)
  279. else:
  280. print(f"No input files found")
  281. return input_files
  282. def get_possible_outputs(
  283. country_name: str,
  284. submission: str,
  285. print_info: bool = False,
  286. )-> List[Path]:
  287. """
  288. For given country name and submission find the possible output files
  289. Parameters
  290. ----------
  291. country_name: str
  292. String containing the country name or ISO 3 letter code
  293. submission: str
  294. String of the submission
  295. print_info: bool = False
  296. If True print information on outputs found
  297. Returns
  298. -------
  299. returns a list pathlib Path objects for the input files
  300. """
  301. codepath = Path(__file__).parent
  302. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  303. rootpath = codepath / ".." / ".."
  304. rootpath = rootpath.resolve()
  305. data_folder = rootpath / "extracted_data"
  306. # obtain country code
  307. country_code = countrynames.to_code_3(country_name)
  308. if country_code is None:
  309. raise ValueError(f"Country name {country_name} can not be mapped to "
  310. f"any country code")
  311. if print_info:
  312. print(f"Country name {country_name} maps to ISO code {country_code}")
  313. output_files = []
  314. for item in data_folder.iterdir():
  315. if item.is_dir():
  316. with open(item / "folder_mapping.json", "r") as mapping_file:
  317. folder_mapping = json.load(mapping_file)
  318. if country_code in folder_mapping:
  319. country_folder = folder_mapping[country_code]
  320. if not isinstance(country_folder, str):
  321. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  322. output_folder = item / country_folder
  323. if output_folder.exists():
  324. for filepath in output_folder.glob(country_code + "_" + submission + "*"):
  325. output_files.append(filepath.relative_to(rootpath))
  326. if print_info:
  327. if output_files:
  328. print(f"Found possible output files:")
  329. for file in output_files:
  330. print(file)
  331. else:
  332. print(f"No output files found")
  333. return output_files
  334. def get_code_file(
  335. country_name: str,
  336. submission: str,
  337. print_info: bool = False,
  338. ) -> Path:
  339. """
  340. For given country name and submission find the script that creates the data
  341. Parameters
  342. ----------
  343. country_name: str
  344. String containing the country name or ISO 3 letter code
  345. submission: str
  346. String of the submission
  347. print_info: bool = False
  348. If True print information on code found
  349. Returns
  350. -------
  351. returns a pathlib Path object for the code file
  352. """
  353. codepath = Path(__file__).parent
  354. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  355. rootpath = codepath / ".." / ".."
  356. rootpath = rootpath.resolve()
  357. code_file_path = None
  358. # obtain country code
  359. country_code = countrynames.to_code_3(country_name)
  360. if country_code is None:
  361. raise ValueError(f"Country name {country_name} can not be mapped to "
  362. f"any country code")
  363. if print_info:
  364. print(f"Country name {country_name} maps to ISO code {country_code}")
  365. with open(codepath / "folder_mapping.json", "r") as mapping_file:
  366. folder_mapping = json.load(mapping_file)
  367. if country_code not in folder_mapping:
  368. if print_info:
  369. print("No code available")
  370. print("")
  371. else:
  372. country_folder = codepath / folder_mapping[country_code]
  373. code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
  374. for file in country_folder.iterdir():
  375. if file.match(code_file_name_candidate):
  376. if code_file_path is not None:
  377. raise ValueError(f"Found multiple code candidates: "
  378. f"{code_file_path} and file.name. "
  379. f"Please use only one file with name "
  380. f"'read_ISO3_submission_XXX.YYY'.")
  381. else:
  382. if print_info:
  383. print(f"Found code file {file.relative_to(rootpath)}")
  384. code_file_path = file
  385. if code_file_path is not None:
  386. return code_file_path.relative_to(rootpath)
  387. else:
  388. return None
  389. def create_folder_mapping(
  390. folder: str,
  391. extracted: bool = False
  392. ) -> None:
  393. """
  394. Create a mapping from 3 letter ISO country codes to folders
  395. based on the subfolders of the given folder. The mapping is
  396. stored in 'folder_mapping.json' in the given folder. Folder
  397. must be given relative to the repository root
  398. Parameters
  399. ----------
  400. folder: str
  401. folder to create the mapping for
  402. extracted: bool = False
  403. If true treat the folder as extracted data, where we
  404. only have one folder per country and no typos in the
  405. names
  406. Returns
  407. -------
  408. Nothing
  409. """
  410. codepath = Path(__file__).parent
  411. rootpath = codepath / ".." / ".."
  412. rootpath = rootpath.resolve()
  413. folder = rootpath / folder
  414. if extracted:
  415. folder_mapping = {}
  416. else:
  417. folder_mapping = {
  418. 'VEN': 'Venezeula_(Bolivarian_Republic_of)',
  419. 'FSM': 'Micronesia_(Federated_State_of)',
  420. 'MKD': 'The_Republic_of_North_Macedonia',
  421. }
  422. known_folders = list(folder_mapping.values())
  423. for item in folder.iterdir():
  424. if item.is_dir():
  425. ISO3 = countrynames.to_code_3(item.name)
  426. if ISO3 is None:
  427. if item.name not in known_folders:
  428. print(folder_mapping.values())
  429. print(f"No match for {item.name}")
  430. else:
  431. known_folders.append(item.name)
  432. if ISO3 in folder_mapping.keys():
  433. folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
  434. else:
  435. folder_mapping[ISO3] = item.name
  436. with open(folder / "folder_mapping.json", "w") as mapping_file:
  437. json.dump(folder_mapping, mapping_file, indent=4)