get_submissions_info.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. # helper functions to get information on available submissions
  2. # and data reading functions for a given country
  3. from typing import List, Dict
  4. from pathlib import Path
  5. import json
  6. import pycountry
  7. #import os
  8. def get_country_submissions(
  9. country_name: str,
  10. print_sub: bool = True,
  11. ) -> Dict[str, List[str]]:
  12. """
  13. Input is a three letter ISO code for a country, or the countries name.
  14. The function tries to map the country name to an ISO code and then
  15. queries the folder mapping files for folders.
  16. Parameters
  17. ----------
  18. country_name: str
  19. String containing the country name or ISO 3 letter code
  20. print_sub: bool
  21. If True information on submissions will be written to stdout
  22. Returns
  23. -------
  24. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  25. Each value is a list of folders
  26. """
  27. codepath = Path(__file__).parent
  28. data_folder = codepath / ".." / ".." / "downloaded_data"
  29. country_code = get_country_code(country_name)
  30. if print_sub:
  31. print(f"Country name {country_name} maps to ISO code {country_code}")
  32. country_submissions = {}
  33. if print_sub:
  34. print(f"#" * 80)
  35. print(f"The following submissions are available for {country_name}")
  36. for item in data_folder.iterdir():
  37. if item.is_dir():
  38. if print_sub:
  39. print("")
  40. print("-" * 80)
  41. print(f"Data folder {item.name}")
  42. print("-" * 80)
  43. with open(item / "folder_mapping.json", "r") as mapping_file:
  44. folder_mapping = json.load(mapping_file)
  45. if country_code in folder_mapping:
  46. country_folders = folder_mapping[country_code]
  47. if isinstance(country_folders, str):
  48. # only one folder
  49. country_folders = [country_folders]
  50. submission_folders = []
  51. for country_folder in country_folders:
  52. current_folder = item / country_folder
  53. if print_sub:
  54. print(f"Submissions in folder {country_folder}:")
  55. for submission_folder in current_folder.iterdir():
  56. if submission_folder.is_dir():
  57. if print_sub:
  58. print(submission_folder.name)
  59. submission_folders.append(submission_folder.name)
  60. country_submissions[item.name] = submission_folders
  61. else:
  62. print(f"No submissions available for {country_name}.")
  63. return country_submissions
  64. def get_country_datasets(
  65. country_name: str,
  66. print_ds: bool = True,
  67. ) -> Dict[str, List[str]]:
  68. """
  69. Input is a three letter ISO code for a country, or the country's name.
  70. The function tries to map the country name to an ISO code and then
  71. checks the code and data folders for content on the country.
  72. Parameters
  73. ----------
  74. country_name: str
  75. String containing the country name or ISO 3 letter code
  76. print_ds: bool
  77. If True information on submissions will be written to stdout
  78. Returns
  79. -------
  80. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  81. Each value is a list of folders
  82. """
  83. codepath = Path(__file__).parent
  84. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  85. rootpath = codepath / ".." / ".."
  86. rootpath = rootpath.resolve()
  87. data_folder = rootpath / "extracted_data"
  88. data_folder_legacy = rootpath / "legacy_data"
  89. # obtain country code
  90. country_code = get_country_code(country_name)
  91. if print_ds:
  92. print(f"Country name {country_name} maps to ISO code {country_code}")
  93. rep_data = {}
  94. # data
  95. if print_ds:
  96. print(f"#" * 80)
  97. print(f"The following datasets are available for {country_name}")
  98. for item in data_folder.iterdir():
  99. if item.is_dir():
  100. cleaned_datasets_current_folder = {}
  101. if print_ds:
  102. print("-" * 80)
  103. print(f"Data folder {item.name}")
  104. print("-" * 80)
  105. with open(item / "folder_mapping.json", "r") as mapping_file:
  106. folder_mapping = json.load(mapping_file)
  107. if country_code not in folder_mapping:
  108. if print_ds:
  109. print("No data available")
  110. print("")
  111. else:
  112. country_folder = folder_mapping[country_code]
  113. if not isinstance(country_folder, str):
  114. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  115. datasets_current_folder = {}
  116. current_folder = item / country_folder
  117. for data_file in current_folder.iterdir():
  118. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  119. if data_file.stem in datasets_current_folder:
  120. datasets_current_folder[data_file.stem].append(data_file.suffix)
  121. else:
  122. datasets_current_folder[data_file.stem] = [data_file.suffix]
  123. for dataset in datasets_current_folder:
  124. # process filename to get submission
  125. parts = dataset.split('_')
  126. if parts[0] != country_code:
  127. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  128. else:
  129. terminology = "_".join(parts[3 : ])
  130. key = f"{parts[1]} ({parts[2]}, {terminology})"
  131. data_info = ""
  132. if '.nc' in datasets_current_folder[dataset]:
  133. data_info = data_info + "NF (.nc), "
  134. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  135. data_info = data_info + "IF (.yaml + .csv), "
  136. elif '.csv' in datasets_current_folder[dataset]:
  137. data_info = data_info + "incomplete IF? (.csv), "
  138. elif '.yaml' in datasets_current_folder[dataset]:
  139. data_info = data_info + "incomplete IF (.yaml), "
  140. code_file = get_code_file(country_code, parts[1])
  141. if code_file:
  142. data_info = data_info + f"code: {code_file.name}"
  143. else:
  144. data_info = data_info + f"code: not found"
  145. cleaned_datasets_current_folder[key] = data_info
  146. if print_ds:
  147. if cleaned_datasets_current_folder:
  148. for country_ds in cleaned_datasets_current_folder:
  149. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  150. else:
  151. print("No data available")
  152. print("")
  153. rep_data[item.name] = cleaned_datasets_current_folder
  154. # legacy data
  155. if print_ds:
  156. print(f"#" * 80)
  157. print(f"The following legacy datasets are available for {country_name}")
  158. legacy_data = {}
  159. for item in data_folder_legacy.iterdir():
  160. if item.is_dir():
  161. cleaned_datasets_current_folder = {}
  162. if print_ds:
  163. print("-" * 80)
  164. print(f"Data folder {item.name}")
  165. print("-" * 80)
  166. with open(item / "folder_mapping.json", "r") as mapping_file:
  167. folder_mapping = json.load(mapping_file)
  168. if country_code not in folder_mapping:
  169. if print_ds:
  170. print("No data available")
  171. print("")
  172. else:
  173. country_folder = folder_mapping[country_code]
  174. if not isinstance(country_folder, str):
  175. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  176. datasets_current_folder = {}
  177. current_folder = item / country_folder
  178. for data_file in current_folder.iterdir():
  179. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  180. if data_file.stem in datasets_current_folder:
  181. datasets_current_folder[data_file.stem].append(data_file.suffix)
  182. else:
  183. datasets_current_folder[data_file.stem] = [data_file.suffix]
  184. for dataset in datasets_current_folder:
  185. # process filename to get submission
  186. parts = dataset.split('_')
  187. if parts[0] != country_code:
  188. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  189. else:
  190. terminology = "_".join(parts[3 : ])
  191. key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
  192. data_info = ""
  193. if '.nc' in datasets_current_folder[dataset]:
  194. data_info = data_info + "NF (.nc), "
  195. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  196. data_info = data_info + "IF (.yaml + .csv), "
  197. elif '.csv' in datasets_current_folder[dataset]:
  198. data_info = data_info + "incomplete IF? (.csv), "
  199. elif '.yaml' in datasets_current_folder[dataset]:
  200. data_info = data_info + "incomplete IF (.yaml), "
  201. cleaned_datasets_current_folder[key] = data_info
  202. if print_ds:
  203. if cleaned_datasets_current_folder:
  204. for country_ds in cleaned_datasets_current_folder:
  205. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  206. else:
  207. print("No data available")
  208. print("")
  209. legacy_data[item.name] = cleaned_datasets_current_folder
  210. all_data = {
  211. "rep_data": rep_data,
  212. "legacy_data": legacy_data,
  213. }
  214. return all_data
  215. def get_country_code(
  216. country_name: str,
  217. )->str:
  218. """
  219. obtain country code. If the input is a code it will be returned, if the input
  220. is not a three letter code a search will be performed
  221. Parameters
  222. __________
  223. country_name: str
  224. Country code or name to get the three-letter code for.
  225. """
  226. try:
  227. # check if it's a 3 letter code
  228. country = pycountry.countries.get(alpha_3=country_name)
  229. country_code = country.alpha_3
  230. except:
  231. try:
  232. country = pycountry.countries.search_fuzzy(country_name)
  233. except:
  234. raise ValueError(f"Country name {country_name} can not be mapped to "
  235. f"any country code")
  236. if len(country) > 1:
  237. country_code = None
  238. for current_country in country:
  239. if current_country.name == country_name:
  240. country_code = current_country.alpha_3
  241. if country_code is None:
  242. raise ValueError(f"Country name {country_name} has {len(country)} "
  243. f"possible results for country codes.")
  244. country_code = country[0].alpha_3
  245. return country_code
  246. def get_possible_inputs(
  247. country_name: str,
  248. submission: str,
  249. print_info: bool = False,
  250. ) -> List[Path]:
  251. """
  252. For given country name and submission find the possible input files
  253. Parameters
  254. ----------
  255. country_name: str
  256. String containing the country name or ISO 3 letter code
  257. submission: str
  258. String of the submission
  259. print_info: bool = False
  260. If True print information on code found
  261. Returns
  262. -------
  263. returns a list pathlib Path objects for the input files
  264. """
  265. codepath = Path(__file__).parent
  266. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  267. rootpath = codepath / ".." / ".."
  268. rootpath = rootpath.resolve()
  269. data_folder = rootpath / "downloaded_data"
  270. # obtain country code
  271. country_code = get_country_code(country_name)
  272. if print_info:
  273. print(f"Country name {country_name} maps to ISO code {country_code}")
  274. input_files = []
  275. for item in data_folder.iterdir():
  276. if item.is_dir():
  277. with open(item / "folder_mapping.json", "r") as mapping_file:
  278. folder_mapping = json.load(mapping_file)
  279. if country_code in folder_mapping:
  280. country_folders = folder_mapping[country_code]
  281. if isinstance(country_folders, str):
  282. # only one folder
  283. country_folders = [country_folders]
  284. for country_folder in country_folders:
  285. input_folder = item / country_folder / submission
  286. if input_folder.exists():
  287. for filepath in input_folder.glob("*"):
  288. input_files.append(filepath.relative_to(rootpath))
  289. if print_info:
  290. if input_files:
  291. print(f"Found possible input files:")
  292. for file in input_files:
  293. print(file)
  294. else:
  295. print(f"No input files found")
  296. return input_files
  297. def get_possible_outputs(
  298. country_name: str,
  299. submission: str,
  300. print_info: bool = False,
  301. )-> List[Path]:
  302. """
  303. For given country name and submission find the possible output files
  304. Parameters
  305. ----------
  306. country_name: str
  307. String containing the country name or ISO 3 letter code
  308. submission: str
  309. String of the submission
  310. print_info: bool = False
  311. If True print information on outputs found
  312. Returns
  313. -------
  314. returns a list pathlib Path objects for the input files
  315. """
  316. codepath = Path(__file__).parent
  317. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  318. rootpath = codepath / ".." / ".."
  319. rootpath = rootpath.resolve()
  320. data_folder = rootpath / "extracted_data"
  321. # obtain country code
  322. country_code = get_country_code(country_name)
  323. if print_info:
  324. print(f"Country name {country_name} maps to ISO code {country_code}")
  325. output_files = []
  326. for item in data_folder.iterdir():
  327. if item.is_dir():
  328. with open(item / "folder_mapping.json", "r") as mapping_file:
  329. folder_mapping = json.load(mapping_file)
  330. if country_code in folder_mapping:
  331. country_folder = folder_mapping[country_code]
  332. if not isinstance(country_folder, str):
  333. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  334. output_folder = item / country_folder
  335. if output_folder.exists():
  336. for filepath in output_folder.glob(country_code + "_" + submission + "*"):
  337. output_files.append(filepath.relative_to(rootpath))
  338. if print_info:
  339. if output_files:
  340. print(f"Found possible output files:")
  341. for file in output_files:
  342. print(file)
  343. else:
  344. print(f"No output files found")
  345. return output_files
  346. def get_code_file(
  347. country_name: str,
  348. submission: str,
  349. print_info: bool = False,
  350. ) -> Path:
  351. """
  352. For given country name and submission find the script that creates the data
  353. Parameters
  354. ----------
  355. country_name: str
  356. String containing the country name or ISO 3 letter code
  357. submission: str
  358. String of the submission
  359. print_info: bool = False
  360. If True print information on code found
  361. Returns
  362. -------
  363. returns a pathlib Path object for the code file
  364. """
  365. codepath = Path(__file__).parent
  366. #codepath = Path(os.getcwd()) / ".." / "code" / "UNFCCC_reader"
  367. rootpath = codepath / ".." / ".."
  368. rootpath = rootpath.resolve()
  369. code_file_path = None
  370. # CRF is an exception as it's read using the UNFCCC_CRF_reader module
  371. # so we return the path to that.
  372. if submission[0:3] == "CRF":
  373. return rootpath / "UNFCCC_CRF_reader"
  374. # obtain country code
  375. country_code = get_country_code(country_name)
  376. if print_info:
  377. print(f"Country name {country_name} maps to ISO code {country_code}")
  378. with open(codepath / "folder_mapping.json", "r") as mapping_file:
  379. folder_mapping = json.load(mapping_file)
  380. if country_code not in folder_mapping:
  381. if print_info:
  382. print("No code available")
  383. print("")
  384. else:
  385. country_folder = codepath / folder_mapping[country_code]
  386. code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
  387. for file in country_folder.iterdir():
  388. if file.match(code_file_name_candidate):
  389. if code_file_path is not None:
  390. raise ValueError(f"Found multiple code candidates: "
  391. f"{code_file_path} and file.name. "
  392. f"Please use only one file with name "
  393. f"'read_ISO3_submission_XXX.YYY'.")
  394. else:
  395. if print_info:
  396. print(f"Found code file {file.relative_to(rootpath)}")
  397. code_file_path = file
  398. if code_file_path is not None:
  399. return code_file_path.relative_to(rootpath)
  400. else:
  401. return None
  402. def create_folder_mapping(
  403. folder: str,
  404. extracted: bool = False
  405. ) -> None:
  406. """
  407. Create a mapping from 3 letter ISO country codes to folders
  408. based on the subfolders of the given folder. The mapping is
  409. stored in 'folder_mapping.json' in the given folder. Folder
  410. must be given relative to the repository root
  411. Parameters
  412. ----------
  413. folder: str
  414. folder to create the mapping for
  415. extracted: bool = False
  416. If true treat the folder as extracted data, where we
  417. only have one folder per country and no typos in the
  418. names
  419. Returns
  420. -------
  421. Nothing
  422. """
  423. codepath = Path(__file__).parent
  424. rootpath = codepath / ".." / ".."
  425. rootpath = rootpath.resolve()
  426. folder = rootpath / folder
  427. if extracted:
  428. folder_mapping = {}
  429. else:
  430. folder_mapping = {
  431. 'VEN': 'Venezeula_(Bolivarian_Republic_of)',
  432. 'FSM': 'Micronesia_(Federated_State_of)',
  433. 'MKD': 'The_Republic_of_North_Macedonia',
  434. }
  435. known_folders = list(folder_mapping.values())
  436. for item in folder.iterdir():
  437. if item.is_dir():
  438. try:
  439. country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
  440. if len(country) > 1:
  441. ISO3 = None
  442. for current_country in country:
  443. if current_country.name == item.name.replace("_", " "):
  444. ISO3 = current_country.alpha_3
  445. else:
  446. ISO3 = country[0].alpha_3
  447. except:
  448. ISO3 = None
  449. if ISO3 is None:
  450. if item.name not in known_folders:
  451. print(folder_mapping.values())
  452. print(f"No match for {item.name}")
  453. else:
  454. known_folders.append(item.name)
  455. if ISO3 in folder_mapping.keys():
  456. folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
  457. else:
  458. folder_mapping[ISO3] = item.name
  459. with open(folder / "folder_mapping.json", "w") as mapping_file:
  460. json.dump(folder_mapping, mapping_file, indent=4)