get_submissions_info.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  1. # helper functions to get information on available submissions
  2. # and data reading functions for a given country
import json
from pathlib import Path
from typing import Dict, List, Optional

import pycountry
#import os
  8. root_path = Path(__file__).parents[2].absolute()
  9. root_path = root_path.resolve()
  10. code_path = root_path / "code"
  11. # beware, folders below are different than for CRF reader
  12. downloaded_data_path = root_path / "downloaded_data"
  13. extracted_data_path = root_path / "extracted_data"
  14. legacy_data_path = root_path / "legacy_data"
  15. # TODO: move this to general util package
  16. custom_country_mapping = {
  17. "EUA": "European Union",
  18. "EUC": "European Union",
  19. "FRK": "France",
  20. "DKE": "Denmark",
  21. "DNM": "Denmark",
  22. "GBK": "United Kingdom of Great Britain and Northern Ireland",
  23. }
  24. def get_country_submissions(
  25. country_name: str,
  26. print_sub: bool = True,
  27. ) -> Dict[str, List[str]]:
  28. """
  29. Input is a three letter ISO code for a country, or the countries name.
  30. The function tries to map the country name to an ISO code and then
  31. queries the folder mapping files for folders.
  32. Parameters
  33. ----------
  34. country_name: str
  35. String containing the country name or ISO 3 letter code
  36. print_sub: bool
  37. If True information on submissions will be written to stdout
  38. Returns
  39. -------
  40. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  41. Each value is a list of folders
  42. """
  43. data_folder = downloaded_data_path
  44. country_code = get_country_code(country_name)
  45. if print_sub:
  46. print(f"Country name {country_name} maps to ISO code {country_code}")
  47. country_submissions = {}
  48. if print_sub:
  49. print(f"#" * 80)
  50. print(f"The following submissions are available for {country_name}")
  51. for item in data_folder.iterdir():
  52. if item.is_dir():
  53. if print_sub:
  54. print("")
  55. print("-" * 80)
  56. print(f"Data folder {item.name}")
  57. print("-" * 80)
  58. with open(item / "folder_mapping.json", "r") as mapping_file:
  59. folder_mapping = json.load(mapping_file)
  60. if country_code in folder_mapping:
  61. country_folders = folder_mapping[country_code]
  62. if isinstance(country_folders, str):
  63. # only one folder
  64. country_folders = [country_folders]
  65. submission_folders = []
  66. for country_folder in country_folders:
  67. current_folder = item / country_folder
  68. if print_sub:
  69. print(f"Submissions in folder {country_folder}:")
  70. for submission_folder in current_folder.iterdir():
  71. if submission_folder.is_dir():
  72. if print_sub:
  73. print(submission_folder.name)
  74. submission_folders.append(submission_folder.name)
  75. country_submissions[item.name] = submission_folders
  76. else:
  77. print(f"No submissions available for {country_name}.")
  78. return country_submissions
  79. def get_country_datasets(
  80. country_name: str,
  81. print_ds: bool = True,
  82. ) -> Dict[str, List[str]]:
  83. """
  84. Input is a three letter ISO code for a country, or the country's name.
  85. The function tries to map the country name to an ISO code and then
  86. checks the code and data folders for content on the country.
  87. Parameters
  88. ----------
  89. country_name: str
  90. String containing the country name or ISO 3 letter code
  91. print_ds: bool
  92. If True information on submissions will be written to stdout
  93. Returns
  94. -------
  95. returns a dict with keys for the dataset classes (e.g. UNFCCC, non-UNFCCC)
  96. Each value is a list of folders
  97. """
  98. data_folder = extracted_data_path
  99. data_folder_legacy = legacy_data_path
  100. # obtain country code
  101. country_code = get_country_code(country_name)
  102. if print_ds:
  103. print(f"Country name {country_name} maps to ISO code {country_code}")
  104. rep_data = {}
  105. # data
  106. if print_ds:
  107. print(f"#" * 80)
  108. print(f"The following datasets are available for {country_name}")
  109. for item in data_folder.iterdir():
  110. if item.is_dir():
  111. cleaned_datasets_current_folder = {}
  112. if print_ds:
  113. print("-" * 80)
  114. print(f"Data folder {item.name}")
  115. print("-" * 80)
  116. with open(item / "folder_mapping.json", "r") as mapping_file:
  117. folder_mapping = json.load(mapping_file)
  118. if country_code not in folder_mapping:
  119. if print_ds:
  120. print("No data available")
  121. print("")
  122. else:
  123. country_folder = folder_mapping[country_code]
  124. if not isinstance(country_folder, str):
  125. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  126. datasets_current_folder = {}
  127. current_folder = item / country_folder
  128. for data_file in current_folder.iterdir():
  129. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  130. if data_file.stem in datasets_current_folder:
  131. datasets_current_folder[data_file.stem].append(data_file.suffix)
  132. else:
  133. datasets_current_folder[data_file.stem] = [data_file.suffix]
  134. for dataset in datasets_current_folder:
  135. # process filename to get submission
  136. parts = dataset.split('_')
  137. if parts[0] != country_code:
  138. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  139. else:
  140. terminology = "_".join(parts[3 : ])
  141. key = f"{parts[1]} ({parts[2]}, {terminology})"
  142. data_info = ""
  143. if '.nc' in datasets_current_folder[dataset]:
  144. data_info = data_info + "NF (.nc), "
  145. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  146. data_info = data_info + "IF (.yaml + .csv), "
  147. elif '.csv' in datasets_current_folder[dataset]:
  148. data_info = data_info + "incomplete IF? (.csv), "
  149. elif '.yaml' in datasets_current_folder[dataset]:
  150. data_info = data_info + "incomplete IF (.yaml), "
  151. code_file = get_code_file(country_code, parts[1])
  152. if code_file:
  153. data_info = data_info + f"code: {code_file.name}"
  154. else:
  155. data_info = data_info + f"code: not found"
  156. cleaned_datasets_current_folder[key] = data_info
  157. if print_ds:
  158. if cleaned_datasets_current_folder:
  159. for country_ds in cleaned_datasets_current_folder:
  160. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  161. else:
  162. print("No data available")
  163. print("")
  164. rep_data[item.name] = cleaned_datasets_current_folder
  165. # legacy data
  166. if print_ds:
  167. print(f"#" * 80)
  168. print(f"The following legacy datasets are available for {country_name}")
  169. legacy_data = {}
  170. for item in data_folder_legacy.iterdir():
  171. if item.is_dir():
  172. cleaned_datasets_current_folder = {}
  173. if print_ds:
  174. print("-" * 80)
  175. print(f"Data folder {item.name}")
  176. print("-" * 80)
  177. with open(item / "folder_mapping.json", "r") as mapping_file:
  178. folder_mapping = json.load(mapping_file)
  179. if country_code not in folder_mapping:
  180. if print_ds:
  181. print("No data available")
  182. print("")
  183. else:
  184. country_folder = folder_mapping[country_code]
  185. if not isinstance(country_folder, str):
  186. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  187. datasets_current_folder = {}
  188. current_folder = item / country_folder
  189. for data_file in current_folder.iterdir():
  190. if data_file.suffix in ['.nc', '.yaml', '.csv']:
  191. if data_file.stem in datasets_current_folder:
  192. datasets_current_folder[data_file.stem].append(data_file.suffix)
  193. else:
  194. datasets_current_folder[data_file.stem] = [data_file.suffix]
  195. for dataset in datasets_current_folder:
  196. # process filename to get submission
  197. parts = dataset.split('_')
  198. if parts[0] != country_code:
  199. cleaned_datasets_current_folder[f'Wrong code: {parts[0]}'] = dataset
  200. else:
  201. terminology = "_".join(parts[3 : ])
  202. key = f"{parts[1]} ({parts[2]}, {terminology}, legacy)"
  203. data_info = ""
  204. if '.nc' in datasets_current_folder[dataset]:
  205. data_info = data_info + "NF (.nc), "
  206. if ('.csv' in datasets_current_folder[dataset]) and ('.yaml' in datasets_current_folder[dataset]):
  207. data_info = data_info + "IF (.yaml + .csv), "
  208. elif '.csv' in datasets_current_folder[dataset]:
  209. data_info = data_info + "incomplete IF? (.csv), "
  210. elif '.yaml' in datasets_current_folder[dataset]:
  211. data_info = data_info + "incomplete IF (.yaml), "
  212. cleaned_datasets_current_folder[key] = data_info
  213. if print_ds:
  214. if cleaned_datasets_current_folder:
  215. for country_ds in cleaned_datasets_current_folder:
  216. print(f"{country_ds}: {cleaned_datasets_current_folder[country_ds]}")
  217. else:
  218. print("No data available")
  219. print("")
  220. legacy_data[item.name] = cleaned_datasets_current_folder
  221. all_data = {
  222. "rep_data": rep_data,
  223. "legacy_data": legacy_data,
  224. }
  225. return all_data
  226. def get_country_code(
  227. country_name: str,
  228. )->str:
  229. """
  230. obtain country code. If the input is a code it will be returned, if the input
  231. is not a three letter code a search will be performed
  232. Parameters
  233. __________
  234. country_name: str
  235. Country code or name to get the three-letter code for.
  236. """
  237. try:
  238. # check if it's a 3 letter code
  239. country = pycountry.countries.get(alpha_3=country_name)
  240. country_code = country.alpha_3
  241. except:
  242. try:
  243. country = pycountry.countries.search_fuzzy(country_name)
  244. except:
  245. raise ValueError(f"Country name {country_name} can not be mapped to "
  246. f"any country code")
  247. if len(country) > 1:
  248. country_code = None
  249. for current_country in country:
  250. if current_country.name == country_name:
  251. country_code = current_country.alpha_3
  252. if country_code is None:
  253. raise ValueError(f"Country name {country_name} has {len(country)} "
  254. f"possible results for country codes.")
  255. country_code = country[0].alpha_3
  256. return country_code
  257. def get_possible_inputs(
  258. country_name: str,
  259. submission: str,
  260. print_info: bool = False,
  261. ) -> List[Path]:
  262. """
  263. For given country name and submission find the possible input files
  264. Parameters
  265. ----------
  266. country_name: str
  267. String containing the country name or ISO 3 letter code
  268. submission: str
  269. String of the submission
  270. print_info: bool = False
  271. If True print information on code found
  272. Returns
  273. -------
  274. returns a list pathlib Path objects for the input files
  275. """
  276. data_folder = downloaded_data_path
  277. # obtain country code
  278. country_code = get_country_code(country_name)
  279. if print_info:
  280. print(f"Country name {country_name} maps to ISO code {country_code}")
  281. input_files = []
  282. for item in data_folder.iterdir():
  283. if item.is_dir():
  284. with open(item / "folder_mapping.json", "r") as mapping_file:
  285. folder_mapping = json.load(mapping_file)
  286. if country_code in folder_mapping:
  287. country_folders = folder_mapping[country_code]
  288. if isinstance(country_folders, str):
  289. # only one folder
  290. country_folders = [country_folders]
  291. for country_folder in country_folders:
  292. input_folder = item / country_folder / submission
  293. if input_folder.exists():
  294. for filepath in input_folder.glob("*"):
  295. input_files.append(filepath.relative_to(root_path))
  296. if print_info:
  297. if input_files:
  298. print(f"Found possible input files:")
  299. for file in input_files:
  300. print(file)
  301. else:
  302. print(f"No input files found")
  303. return input_files
  304. def get_possible_outputs(
  305. country_name: str,
  306. submission: str,
  307. print_info: bool = False,
  308. )-> List[Path]:
  309. """
  310. For given country name and submission find the possible output files
  311. Parameters
  312. ----------
  313. country_name: str
  314. String containing the country name or ISO 3 letter code
  315. submission: str
  316. String of the submission
  317. print_info: bool = False
  318. If True print information on outputs found
  319. Returns
  320. -------
  321. returns a list pathlib Path objects for the input files
  322. """
  323. data_folder = extracted_data_path
  324. # obtain country code
  325. country_code = get_country_code(country_name)
  326. if print_info:
  327. print(f"Country name {country_name} maps to ISO code {country_code}")
  328. output_files = []
  329. for item in data_folder.iterdir():
  330. if item.is_dir():
  331. with open(item / "folder_mapping.json", "r") as mapping_file:
  332. folder_mapping = json.load(mapping_file)
  333. if country_code in folder_mapping:
  334. country_folder = folder_mapping[country_code]
  335. if not isinstance(country_folder, str):
  336. raise ValueError("Wrong data type in folder mapping json file. Should be str.")
  337. output_folder = item / country_folder
  338. if output_folder.exists():
  339. for filepath in output_folder.glob(country_code + "_" + submission + "*"):
  340. output_files.append(filepath.relative_to(root_path))
  341. if print_info:
  342. if output_files:
  343. print(f"Found possible output files:")
  344. for file in output_files:
  345. print(file)
  346. else:
  347. print(f"No output files found")
  348. return output_files
  349. def get_code_file(
  350. country_name: str,
  351. submission: str,
  352. print_info: bool = False,
  353. ) -> Path:
  354. """
  355. For given country name and submission find the script that creates the data
  356. Parameters
  357. ----------
  358. country_name: str
  359. String containing the country name or ISO 3 letter code
  360. submission: str
  361. String of the submission
  362. print_info: bool = False
  363. If True print information on code found
  364. Returns
  365. -------
  366. returns a pathlib Path object for the code file
  367. """
  368. code_file_path = None
  369. # CRF is an exception as it's read using the UNFCCC_CRF_reader module
  370. # so we return the path to that.
  371. if submission[0:3] == "CRF":
  372. return root_path / "UNFCCC_CRF_reader"
  373. # obtain country code
  374. country_code = get_country_code(country_name)
  375. if print_info:
  376. print(f"Country name {country_name} maps to ISO code {country_code}")
  377. with open(code_path / "folder_mapping.json", "r") as mapping_file:
  378. folder_mapping = json.load(mapping_file)
  379. if country_code not in folder_mapping:
  380. if print_info:
  381. print("No code available")
  382. print("")
  383. else:
  384. country_folder = code_path / folder_mapping[country_code]
  385. code_file_name_candidate = "read_" + country_code + "_" + submission + "*"
  386. for file in country_folder.iterdir():
  387. if file.match(code_file_name_candidate):
  388. if code_file_path is not None:
  389. raise ValueError(f"Found multiple code candidates: "
  390. f"{code_file_path} and file.name. "
  391. f"Please use only one file with name "
  392. f"'read_ISO3_submission_XXX.YYY'.")
  393. else:
  394. if print_info:
  395. print(f"Found code file {file.relative_to(root_path)}")
  396. code_file_path = file
  397. if code_file_path is not None:
  398. return code_file_path.relative_to(root_path)
  399. else:
  400. return None
  401. def create_folder_mapping(
  402. folder: str,
  403. extracted: bool = False
  404. ) -> None:
  405. """
  406. Create a mapping from 3 letter ISO country codes to folders
  407. based on the subfolders of the given folder. The mapping is
  408. stored in 'folder_mapping.json' in the given folder. Folder
  409. must be given relative to the repository root
  410. Parameters
  411. ----------
  412. folder: str
  413. folder to create the mapping for
  414. extracted: bool = False
  415. If true treat the folder as extracted data, where we
  416. only have one folder per country and no typos in the
  417. names
  418. Returns
  419. -------
  420. Nothing
  421. """
  422. folder = root_path / folder
  423. folder_mapping = custom_country_mapping
  424. if not extracted:
  425. folder_mapping = {
  426. **folder_mapping,
  427. **{
  428. 'VEN': 'Venezeula_(Bolivarian_Republic_of)',
  429. 'FSM': 'Micronesia_(Federated_State_of)',
  430. 'MKD': 'The_Republic_of_North_Macedonia',
  431. }
  432. }
  433. known_folders = list(folder_mapping.values())
  434. print(f"known_folders: {known_folders}")
  435. for item in folder.iterdir():
  436. if item.is_dir():
  437. try:
  438. country = pycountry.countries.search_fuzzy(item.name.replace("_", " "))
  439. if len(country) > 1:
  440. ISO3 = None
  441. for current_country in country:
  442. if current_country.name == item.name.replace("_", " "):
  443. ISO3 = current_country.alpha_3
  444. else:
  445. ISO3 = country[0].alpha_3
  446. except:
  447. ISO3 = None
  448. if ISO3 is None:
  449. if item.name not in known_folders:
  450. print(folder_mapping.values())
  451. print(f"No match for {item.name}")
  452. else:
  453. known_folders.append(item.name)
  454. if ISO3 in folder_mapping.keys():
  455. folder_mapping[ISO3] = [folder_mapping[ISO3], item.name]
  456. else:
  457. folder_mapping[ISO3] = item.name
  458. with open(folder / "folder_mapping.json", "w") as mapping_file:
  459. json.dump(folder_mapping, mapping_file, indent=4)