get_submissions_info.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572
  1. # helper functions to get information on available submissions
  2. # and data reading functions for a given country
  3. from typing import List, Dict
  4. from pathlib import Path
  5. import json
  6. import pycountry
  7. #import os
  8. root_path = Path(__file__).parents[2].absolute()
  9. root_path = root_path.resolve()
  10. code_path = root_path / "code" / "UNFCCC_reader"
  11. # beware, folders below are different than for CRF reader
  12. downloaded_data_path = root_path / "downloaded_data"
  13. extracted_data_path = root_path / "extracted_data"
  14. legacy_data_path = root_path / "legacy_data"
  15. # TODO: move this to general util package
  16. custom_country_mapping = {
  17. "EUA": "European Union",
  18. "EUC": "European Union",
  19. "FRK": "France",
  20. "DKE": "Denmark",
  21. "DNM": "Denmark",
  22. "GBK": "United Kingdom of Great Britain and Northern Ireland",
  23. }
  24. custom_folders = {
  25. 'Venezeula_(Bolivarian_Republic_of)': 'VEN',
  26. 'Venezuela_(Bolivarian_Republic_of)': 'VEN',
  27. 'Micronesia_(Federated_State_of)': 'FSM',
  28. 'Micronesia_(Federated_States_of)': 'FSM',
  29. 'The_Republic_of_North_Macedonia': 'MKD',
  30. 'Republic_of_Korea': 'KOR',
  31. 'Bolivia_(Plurinational_State_of)': 'BOL',
  32. 'Türkiye': 'TUR',
  33. 'Iran_(Islamic_Republic_of)': 'IRN',
  34. 'Côte_d’Ivoire': 'CIV',
  35. 'Democratic_Republic_of_the_Congo': "COD",
  36. 'European_Union': 'EUA',
  37. }
  38. def get_country_submissions(
  39. country_name: str,
  40. print_sub: bool = True,
  41. ) -> Dict[str, List[str]]:
  42. """
  43. Input is a three letter ISO code for a country, or the countries name.
  44. The function tries to map the country name to an ISO code and then
  45. queries the folder mapping files for folders.
  46. Parameters
  47. ----------
  48. country_name: str
  49. String containing the country name or ISO 3 letter code
  50. print_sub: bool
  51. If True information on submissions will be written to stdout
  52. Returns
  53. -------
  54. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  55. Each value is a list of folders
  56. """
  57. data_folder = downloaded_data_path
  58. country_code = get_country_code(country_name)
  59. if print_sub:
  60. print(f"Country name {country_name} maps to ISO code {country_code}")
  61. country_submissions = {}
  62. if print_sub:
  63. print(f"#" * 80)
  64. print(f"The following submissions are available for {country_name}")
  65. for item in data_folder.iterdir():
  66. if item.is_dir():
  67. if print_sub:
  68. print("")
  69. print("-" * 80)
  70. print(f"Data folder {item.name}")
  71. print("-" * 80)
  72. with open(item / "folder_mapping.json", "r") as mapping_file:
  73. folder_mapping = json.load(mapping_file)
  74. if country_code in folder_mapping:
  75. country_folders = folder_mapping[country_code]
  76. if isinstance(country_folders, str):
  77. # only one folder
  78. country_folders = [country_folders]
  79. submission_folders = []
  80. for country_folder in country_folders:
  81. current_folder = item / country_folder
  82. if print_sub:
  83. print(f"Submissions in folder {country_folder}:")
  84. for submission_folder in current_folder.iterdir():
  85. if submission_folder.is_dir():
  86. if print_sub:
  87. print(submission_folder.name)
  88. submission_folders.append(submission_folder.name)
  89. country_submissions[item.name] = submission_folders
  90. else:
  91. print(f"No submissions available for {country_name}.")
  92. return country_submissions
  93. def get_country_datasets(
  94. country_name: str,
  95. print_ds: bool = True,
  96. ) -> Dict[str, List[str]]:
  97. """
  98. Input is a three letter ISO code for a country, or the country's name.
  99. The function tries to map the country name to an ISO code and then
  100. checks the code and data folders for content on the country.
  101. Parameters
  102. ----------
  103. country_name: str
  104. String containing the country name or ISO 3 letter code
  105. print_ds: bool
  106. If True information on submissions will be written to stdout
  107. Returns
  108. -------
  109. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  110. Each value is a list of folders
  111. """
  112. data_folder = extracted_data_path
  113. data_folder_legacy = legacy_data_path
  114. # obtain country code
  115. country_code = get_country_code(country_name)
  116. if print_ds:
  117. print(f"Country name {country_name} maps to ISO code {country_code}")
  118. rep_data = {}
  119. # data
  120. if print_ds:
  121. print(f"#" * 80)
  122. print(f"The following datasets are available for {country_name}")
  123. for item in data_folder.iterdir():
  124. if item.is_dir():
  125. cleaned_datasets_current_folder = {}
  126. if print_ds:
  127. print("-" * 80)
  128. print(f"Data folder {item.name}")
  129. print("-" * 80)
  130. with open(item / "folder_mapping.json", "r") as mapping_file:
  131. folder_mapping = json.load(mapping_file)
  132. if country_code not in folder_mapping:
  133. if print_ds:
  134. print("No data available")
  135. print("")
  136. else:
  137. country_folder = folder_mapping[country_code]
  138. if not isinstance(country_folder, str):
  139. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  140. datasets_current_folder = {}
  141. current_folder = item / country_folder
  142. for data_file in current_folder.iterdir():
  143. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  144. if data_file.stem in datasets_current_folder:
  145. datasets_current_folder[data_file.stem].append(data_file.suffix)
  146. else:
  147. datasets_current_folder[data_file.stem] = [data_file.suffix]
  148. for dataset in datasets_current_folder:
  149. # process filename to get submission
  150. parts = dataset.split('_')
  151. if parts[0] != country_code:
  152. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  153. else:
  154. terminology = "_".join(parts[3 : ])
  155. key = f"{parts[1]} ({parts[2]}, {terminology})"
  156. data_info = ""
  157. if '.nc' in datasets_current_folder[dataset]:
  158. data_info = data_info + "NF (.nc), "
  159. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  160. data_info = data_info + "IF (.yaml + .csv), "
  161. elif '.csv' in datasets_current_folder[dataset]:
  162. data_info = data_info + "incomplete IF? (.csv), "
  163. elif '.yaml' in datasets_current_folder[dataset]:
  164. data_info = data_info + "incomplete IF (.yaml), "
  165. code_file = get_code_file(country_code, parts[1])
  166. if code_file:
  167. data_info = data_info + f"code: {code_file.name}"
  168. else:
  169. data_info = data_info + f"code: not found"
  170. cleaned_datasets_current_folder[key] = data_info
  171. if print_ds:
  172. if cleaned_datasets_current_folder:
  173. for country_ds in cleaned_datasets_current_folder:
  174. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  175. else:
  176. print("No data available")
  177. print("")
  178. rep_data[item.name] = cleaned_datasets_current_folder
  179. # legacy data
  180. if print_ds:
  181. print(f"#" * 80)
  182. print(f"The following legacy datasets are available for {country_name}")
  183. legacy_data = {}
  184. for item in data_folder_legacy.iterdir():
  185. if item.is_dir():
  186. cleaned_datasets_current_folder = {}
  187. if print_ds:
  188. print("-" * 80)
  189. print(f"Data folder {item.name}")
  190. print("-" * 80)
  191. with open(item / "folder_mapping.json", "r") as mapping_file:
  192. folder_mapping = json.load(mapping_file)
  193. if country_code not in folder_mapping:
  194. if print_ds:
  195. print("No data available")
  196. print("")
  197. else:
  198. country_folder = folder_mapping[country_code]
  199. if not isinstance(country_folder, str):
  200. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  201. datasets_current_folder = {}
  202. current_folder = item / country_folder
  203. for data_file in current_folder.iterdir():
  204. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  205. if data_file.stem in datasets_current_folder:
  206. datasets_current_folder[data_file.stem].append(data_file.suffix)
  207. else:
  208. datasets_current_folder[data_file.stem] = [data_file.suffix]
  209. for dataset in datasets_current_folder:
  210. # process filename to get submission
  211. parts = dataset.split('_')
  212. if parts[0] != country_code:
  213. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  214. else:
  215. terminology = "_".join(parts[3 : ])
  216. key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
  217. data_info = ""
  218. if '.nc' in datasets_current_folder[dataset]:
  219. data_info = data_info + "NF (.nc), "
  220. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  221. data_info = data_info + "IF (.yaml + .csv), "
  222. elif '.csv' in datasets_current_folder[dataset]:
  223. data_info = data_info + "incomplete IF? (.csv), "
  224. elif '.yaml' in datasets_current_folder[dataset]:
  225. data_info = data_info + "incomplete IF (.yaml), "
  226. cleaned_datasets_current_folder[key] = data_info
  227. if print_ds:
  228. if cleaned_datasets_current_folder:
  229. for country_ds in cleaned_datasets_current_folder:
  230. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  231. else:
  232. print("No data available")
  233. print("")
  234. legacy_data[item.name] = cleaned_datasets_current_folder
  235. all_data = {
  236. "rep_data": rep_data,
  237. "legacy_data": legacy_data,
  238. }
  239. return all_data
  240. def get_country_code(
  241. country_name: str,
  242. )->str:
  243. """
  244. obtain country code. If the input is a code it will be returned, if the input
  245. is not a three letter code a search will be performed
  246. Parameters
  247. __________
  248. country_name: str
  249. Country code or name to get the three-letter code for.
  250. """
  251. # First check if it's in the list of custom codes
  252. if country_name in custom_country_mapping:
  253. country_code = country_name
  254. else:
  255. try:
  256. # check if it's a 3 letter code
  257. country = pycountry.countries.get(alpha_3=country_name)
  258. country_code = country.alpha_3
  259. except:
  260. try:
  261. country = pycountry.countries.search_fuzzy(country_name.replace("_", " "))
  262. except:
  263. raise ValueError(f"Country name {country_name} can not be mapped to "
  264. f"any country code. Try using the ISO3 code directly.")
  265. if len(country) > 1:
  266. country_code = None
  267. for current_country in country:
  268. if current_country.name == country_name:
  269. country_code = current_country.alpha_3
  270. if country_code is None:
  271. raise ValueError(f"Country name {country_name} has {len(country)} "
  272. f"possible results for country codes.")
  273. country_code = country[0].alpha_3
  274. return country_code
  275. def get_possible_inputs(
  276. country_name: str,
  277. submission: str,
  278. print_info: bool = False,
  279. ) -> List[Path]:
  280. """
  281. For given country name and submission find the possible input files
  282. Parameters
  283. ----------
  284. country_name: str
  285. String containing the country name or ISO 3 letter code
  286. submission: str
  287. String of the submission
  288. print_info: bool = False
  289. If True print information on code found
  290. Returns
  291. -------
  292. returns a list pathlib Path objects for the input files
  293. """
  294. data_folder = downloaded_data_path
  295. # obtain country code
  296. country_code = get_country_code(country_name)
  297. if print_info:
  298. print(f"Country name {country_name} maps to ISO code {country_code}")
  299. input_files = []
  300. for item in data_folder.iterdir():
  301. if item.is_dir():
  302. with open(item / "folder_mapping.json", "r") as mapping_file:
  303. folder_mapping = json.load(mapping_file)
  304. if country_code in folder_mapping:
  305. country_folders = folder_mapping[country_code]
  306. if isinstance(country_folders, str):
  307. # only one folder
  308. country_folders = [country_folders]
  309. for country_folder in country_folders:
  310. input_folder = item / country_folder / submission
  311. if input_folder.exists():
  312. for filepath in input_folder.glob("*"):
  313. input_files.append(filepath.relative_to(root_path))
  314. if print_info:
  315. if input_files:
  316. print(f"Found possible input files:")
  317. for file in input_files:
  318. print(file)
  319. else:
  320. print(f"No input files found")
  321. return input_files
  322. def get_possible_outputs(
  323. country_name: str,
  324. submission: str,
  325. print_info: bool = False,
  326. )-> List[Path]:
  327. """
  328. For given country name and submission find the possible output files
  329. Parameters
  330. ----------
  331. country_name: str
  332. String containing the country name or ISO 3 letter code
  333. submission: str
  334. String of the submission
  335. print_info: bool = False
  336. If True print information on outputs found
  337. Returns
  338. -------
  339. returns a list pathlib Path objects for the input files
  340. """
  341. data_folder = extracted_data_path
  342. # obtain country code
  343. country_code = get_country_code(country_name)
  344. if print_info:
  345. print(f"Country name {country_name} maps to ISO code {country_code}")
  346. output_files = []
  347. for item in data_folder.iterdir():
  348. if item.is_dir():
  349. with open(item / "folder_mapping.json", "r") as mapping_file:
  350. folder_mapping = json.load(mapping_file)
  351. if country_code in folder_mapping:
  352. country_folder = folder_mapping[country_code]
  353. if not isinstance(country_folder, str):
  354. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  355. output_folder = item / country_folder
  356. if output_folder.exists():
  357. for filepath in output_folder.glob(country_code + "_" + submission + "*"):
  358. output_files.append(filepath.relative_to(root_path))
  359. if print_info:
  360. if output_files:
  361. print(f"Found possible output files:")
  362. for file in output_files:
  363. print(file)
  364. else:
  365. print(f"No output files found")
  366. return output_files
  367. def get_code_file(
  368. country_name: str,
  369. submission: str,
  370. print_info: bool = False,
  371. ) -> Path:
  372. """
  373. For given country name and submission find the script that creates the data
  374. Parameters
  375. ----------
  376. country_name: str
  377. String containing the country name or ISO 3 letter code
  378. submission: str
  379. String of the submission
  380. print_info: bool = False
  381. If True print information on code found
  382. Returns
  383. -------
  384. returns a pathlib Path object for the code file
  385. """
  386. code_file_path = None
  387. # CRF is an exception as it's read using the UNFCCC_CRF_reader module
  388. # so we return the path to that.
  389. if submission[0:3] == "CRF":
  390. return root_path / "UNFCCC_CRF_reader"
  391. # obtain country code
  392. country_code = get_country_code(country_name)
  393. if print_info:
  394. print(f"Country name {country_name} maps to ISO code {country_code}")
  395. with open(code_path / "folder_mapping.json", "r") as mapping_file:
  396. folder_mapping = json.load(mapping_file)
  397. if country_code not in folder_mapping:
  398. if print_info:
  399. print("No code available")
  400. print("")
  401. else:
  402. country_folder = code_path / folder_mapping[country_code]
  403. code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
  404. for file in country_folder.iterdir():
  405. if file.match(code_file_name_candidate):
  406. if code_file_path is not None:
  407. raise ValueError(f"Found multiple code candidates: "
  408. f"{code_file_path} and file.name. "
  409. f"Please use only one file with name "
  410. f"'read_ISO3_submission_XXX.YYY'.")
  411. else:
  412. if print_info:
  413. print(f"Found code file {file.relative_to(root_path)}")
  414. code_file_path = file
  415. if code_file_path is not None:
  416. return code_file_path.relative_to(root_path)
  417. else:
  418. return None
  419. def create_folder_mapping(
  420. folder: str,
  421. extracted: bool = False
  422. ) -> None:
  423. """
  424. Create a mapping from 3 letter ISO country codes to folders
  425. based on the subfolders of the given folder. The mapping is
  426. stored in 'folder_mapping.json' in the given folder. Folder
  427. must be given relative to the repository root
  428. Parameters
  429. ----------
  430. folder: str
  431. folder to create the mapping for
  432. extracted: bool = False
  433. If true treat the folder as extracted data, where we
  434. only have one folder per country and no typos in the
  435. names
  436. Returns
  437. -------
  438. Nothing
  439. """
  440. folder = root_path / folder
  441. folder_mapping = {}
  442. #if not extracted:
  443. known_folders = custom_folders
  444. #else:
  445. # known_folders = {}
  446. for item in folder.iterdir():
  447. if item.is_dir() and not item.match("__pycache__"):
  448. if item.name in known_folders:
  449. ISO3 = known_folders[item.name]
  450. else:
  451. try:
  452. country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
  453. if len(country) > 1:
  454. ISO3 = None
  455. for current_country in country:
  456. if current_country.name == item.name.replace("_", " "):
  457. ISO3 = current_country.alpha_3
  458. else:
  459. ISO3 = country[0].alpha_3
  460. except:
  461. ISO3 = None
  462. if ISO3 is None:
  463. print(f"No match for {item.name}")
  464. else:
  465. if ISO3 in folder_mapping.keys():
  466. folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
  467. else:
  468. folder_mapping[ISO3] = item.name
  469. with open(folder / "folder_mapping.json", "w") as mapping_file:
  470. json.dump(folder_mapping, mapping_file, indent=4)