functions_temp.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. """Temporary file for new functions to avoid merging issues due to different automatic formatting. Delete after merge."""
  2. import pandas as pd
  3. import warnings
  4. import numpy as np
  5. def find_and_replace_values(
  6. df: pd.DataFrame,
  7. replace_info: list[tuple[str | float]],
  8. category_column: str,
  9. entity_column: str = "entity",
  10. ) -> pd.DataFrame:
  11. """
  12. Find values and replace single values in a dataframe.
  13. Input
  14. -----
  15. df
  16. Input data frame
  17. replace_info
  18. Category, entity, year, and new value. Don't put a new value if you would like to replace with nan.
  19. For example [("3.C", "CO", "2019", 3.423)] or [("3.C", "CO", "2019")]
  20. category_column
  21. The name of the column that contains the categories.
  22. entity_column
  23. The name of the column that contains the categories.
  24. Output
  25. ------
  26. Data frame with updated values.
  27. """
  28. for replace_info_value in replace_info:
  29. category = replace_info_value[0]
  30. entity = replace_info_value[1]
  31. year = replace_info_value[2]
  32. if len(replace_info_value) == 4:
  33. new_value = replace_info_value[3]
  34. elif len(replace_info_value) == 3:
  35. new_value = np.nan
  36. else:
  37. raise AssertionError(
  38. f"Expected tuple of length 3 or 4. Got {replace_info_value}"
  39. )
  40. index = df.loc[
  41. (df[category_column] == category) & (df[entity_column] == entity),
  42. ].index[0]
  43. # pandas recommends using .at[] for changing single values
  44. df.at[index, year] = new_value
  45. print(f"Set value for {category}, {entity}, {year} to {new_value}.")
  46. return df
  47. def assert_values(
  48. df: pd.DataFrame,
  49. test_case: tuple[str | float | int],
  50. category_column: str = "category (IPCC1996_2006_GIN_Inv)",
  51. entity_column: str = "entity",
  52. ) -> None:
  53. """
  54. Check if a value in a dataframe matches the expected value.
  55. Input
  56. -----
  57. df
  58. The data frame to check.
  59. test_case
  60. The combination of parameters and the expected value.
  61. Use the format (<category>, <entity>, <year>, <expected_value>).
  62. category_column
  63. The columns where to look for the category.
  64. entity_column
  65. The column where to look for the entity.
  66. """
  67. category = test_case[0]
  68. entity = test_case[1]
  69. year = test_case[2]
  70. expected_value = test_case[3]
  71. assert isinstance(expected_value, (float, int)), "This function only works for numbers. Use assert_nan_values to check for NaNs and empty values."
  72. arr = df.loc[
  73. (df[category_column] == category) & (df[entity_column] == entity), year
  74. ].values
  75. # Assert the category exists in the data frame
  76. assert (
  77. category in df[category_column].unique()
  78. ), f"{category} is not a valid category. Choose from {df[category_column].unique()}"
  79. # Assert the entity exists in the data frame
  80. assert (
  81. entity in df[entity_column].unique()
  82. ), f"{entity} is not a valid entity. Choose from {df[entity_column].unique()}"
  83. assert (
  84. arr.size > 0
  85. ), f"No value found for category {category}, entity {entity}, year {year}!"
  86. assert (
  87. arr.size <= 1
  88. ), f"More than one value found for category {category}, entity {entity}, year {year}!"
  89. assert (
  90. arr[0] == test_case[3]
  91. ), f"Expected value {expected_value}, actual value is {arr[0]}"
  92. print(
  93. f"Value for category {category}, entity {entity}, year {year} is as expected."
  94. )
  95. def assert_nan_values(
  96. df: pd.DataFrame,
  97. test_case: tuple[str, ...],
  98. category_column: str = "category (IPCC1996_2006_GIN_Inv)",
  99. entity_column: str = "entity",
  100. ) -> None:
  101. """
  102. Check if values that are empty or NE or NE1 in the PDF tables
  103. are not present in the dataset.
  104. Input
  105. -----
  106. df
  107. The data frame to check.
  108. test_case
  109. The combination of input parameters.
  110. Use the format (<category>, <entity>, <year>).
  111. category_column
  112. The columns where to look for the category.
  113. entity_column
  114. The column where to look for the entity.
  115. """
  116. category = test_case[0]
  117. entity = test_case[1]
  118. year = test_case[2]
  119. if category not in df[category_column].unique():
  120. warning_string = f"{category} is not in the data set. Either all values for this category are NaN or the category never existed in the data set."
  121. warnings.warn(warning_string)
  122. return
  123. if entity not in df[entity_column].unique():
  124. warning_string = f"{entity} is not in the data set. Either all values for this entity are NaN or the category never existed in the data set."
  125. warnings.warn(warning_string)
  126. return
  127. arr = df.loc[
  128. (df[category_column] == category) & (df[entity_column] == entity), year
  129. ].values
  130. assert np.isnan(arr[0]), f"Value is {arr[0]} and not NaN."
  131. print(f"Value for category {category}, entity {entity}, year {year} is NaN.")