UNFCCC_CRF_reader_devel.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. """
  2. This file holds functions that are used in CRF reading development like
  3. adding new tables or new submission years (and the corresponding country-specific
  4. categories). The functions are tailored towards debug output and reading
  5. of single years in contrast to the production functions which are tailored
  6. towards the creation of full datasets including storage in the
  7. """
  8. import pandas as pd
  9. from typing import List
  10. from pathlib import Path
  11. def save_unknown_categories_info(
  12. unknown_categories: List[List],
  13. file: Path,
  14. ) -> None:
  15. """
  16. Save information on unknown categories to a csv file.
  17. Parameters
  18. __________
  19. unknown_categories: List[List]
  20. List of lists with information on the unknown categories.
  21. (which table, country and year, and which categories)
  22. file: pathlib.Path
  23. File including path where the data should be stored
  24. """
  25. # process unknown categories
  26. df_unknown_cats = pd.DataFrame(unknown_categories, columns=["Table", "Country", "Category", "Year"])
  27. processed_cats = []
  28. all_tables = df_unknown_cats["Table"].unique()
  29. all_years = set(df_unknown_cats["Year"].unique())
  30. all_years = set([year for year in all_years if isinstance(year, int)])
  31. all_years = set([year for year in all_years if int(year) > 1989])
  32. for table in all_tables:
  33. df_cats_current_table = df_unknown_cats[df_unknown_cats["Table"] == table]
  34. cats_current_table = list(df_cats_current_table["Category"].unique())
  35. for cat in cats_current_table:
  36. df_current_cat_table = df_cats_current_table[df_cats_current_table["Category"] == cat]
  37. all_countries = df_current_cat_table["Country"].unique()
  38. countries_cat = ""
  39. for country in all_countries:
  40. years_country = df_current_cat_table[df_current_cat_table["Country"] == country]["Year"].unique()
  41. if set(years_country) == all_years:
  42. countries_cat = f"{countries_cat}; {country}"
  43. else:
  44. countries_cat = f"{countries_cat}; {country} ({years_country})"
  45. processed_cats.append([table, cat, countries_cat])
  46. folder = file.parents[0]
  47. if not folder.exists:
  48. folder.mkdir()
  49. df_processed_cats = pd.DataFrame(processed_cats, columns=["Table", "Category", "Countries"])
  50. df_processed_cats.to_csv(file, index=False)
  51. def save_last_row_info(
  52. last_row_info: List[List],
  53. file: Path,
  54. ) -> None:
  55. """
  56. Save information on data found in the last row read for a table.
  57. The last row read should not contain data. If it does contain data
  58. it is a hint that table size is larger for some countries than
  59. given in the specification and thus we might not read the full table.
  60. Parameters
  61. __________
  62. last_row_info: List[List]
  63. List of lists with information on the unknown categories.
  64. (which table, country and year, and which categories)
  65. file: pathlib.Path
  66. File including path where the data should be stored
  67. """
  68. # process last row with information messages
  69. df_last_row_info = pd.DataFrame(last_row_info, columns=["Table", "Country", "Category", "Year"])
  70. processed_last_row_info = []
  71. all_tables = df_last_row_info["Table"].unique()
  72. all_years = set(df_last_row_info["Year"].unique())
  73. all_years = set([year for year in all_years if isinstance(year, int)])
  74. all_years = set([year for year in all_years if year > 1989])
  75. for table in all_tables:
  76. df_last_row_current_table = df_last_row_info[df_last_row_info["Table"] == table]
  77. all_countries = df_last_row_current_table["Country"].unique()
  78. for country in all_countries:
  79. df_current_country_table = df_last_row_current_table[df_last_row_current_table["Country"] == country]
  80. all_categories = df_current_country_table["Category"].unique()
  81. cats_country = ""
  82. for cat in all_categories:
  83. years_category = df_current_country_table[df_current_country_table["Category"] == cat]["Year"].unique()
  84. if set(years_category) == all_years:
  85. cats_country = f"{cats_country}; {cat}"
  86. else:
  87. cats_country = f"{cats_country}; {cat} ({years_category})"
  88. processed_last_row_info.append([table, country, cats_country])
  89. folder = file.parents[0]
  90. if not folder.exists:
  91. folder.mkdir()
  92. df_processed_lost_row_info = pd.DataFrame(processed_last_row_info, columns=["Table", "Country", "Categories"])
  93. df_processed_lost_row_info.to_csv("test_last_row_info.csv", index=False)