123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160 |
- """Temporary file for new functions to avoid merging issues due to different automatic formatting. Delete after merge."""
- import pandas as pd
- import warnings
- import numpy as np
- def find_and_replace_values(
- df: pd.DataFrame,
- replace_info: list[tuple[str | float]],
- category_column: str,
- entity_column: str = "entity",
- ) -> pd.DataFrame:
- """
- Find values and replace single values in a dataframe.
- Input
- -----
- df
- Input data frame
- replace_info
- Category, entity, year, and new value. Don't put a new value if you would like to replace with nan.
- For example [("3.C", "CO", "2019", 3.423)] or [("3.C", "CO", "2019")]
- category_column
- The name of the column that contains the categories.
- entity_column
- The name of the column that contains the categories.
- Output
- ------
- Data frame with updated values.
- """
- for replace_info_value in replace_info:
- category = replace_info_value[0]
- entity = replace_info_value[1]
- year = replace_info_value[2]
- if len(replace_info_value) == 4:
- new_value = replace_info_value[3]
- elif len(replace_info_value) == 3:
- new_value = np.nan
- else:
- raise AssertionError(
- f"Expected tuple of length 3 or 4. Got {replace_info_value}"
- )
- index = df.loc[
- (df[category_column] == category) & (df[entity_column] == entity),
- ].index[0]
- # pandas recommends using .at[] for changing single values
- df.at[index, year] = new_value
- print(f"Set value for {category}, {entity}, {year} to {new_value}.")
- return df
- def assert_values(
- df: pd.DataFrame,
- test_case: tuple[str | float | int],
- category_column: str = "category (IPCC1996_2006_GIN_Inv)",
- entity_column: str = "entity",
- ) -> None:
- """
- Check if a value in a dataframe matches the expected value.
- Input
- -----
- df
- The data frame to check.
- test_case
- The combination of parameters and the expected value.
- Use the format (<category>, <entity>, <year>, <expected_value>).
- category_column
- The columns where to look for the category.
- entity_column
- The column where to look for the entity.
- """
- category = test_case[0]
- entity = test_case[1]
- year = test_case[2]
- expected_value = test_case[3]
- assert isinstance(expected_value, (float, int)), "This function only works for numbers. Use assert_nan_values to check for NaNs and empty values."
- arr = df.loc[
- (df[category_column] == category) & (df[entity_column] == entity), year
- ].values
- # Assert the category exists in the data frame
- assert (
- category in df[category_column].unique()
- ), f"{category} is not a valid category. Choose from {df[category_column].unique()}"
- # Assert the entity exists in the data frame
- assert (
- entity in df[entity_column].unique()
- ), f"{entity} is not a valid entity. Choose from {df[entity_column].unique()}"
- assert (
- arr.size > 0
- ), f"No value found for category {category}, entity {entity}, year {year}!"
- assert (
- arr.size <= 1
- ), f"More than one value found for category {category}, entity {entity}, year {year}!"
- assert (
- arr[0] == test_case[3]
- ), f"Expected value {expected_value}, actual value is {arr[0]}"
- print(
- f"Value for category {category}, entity {entity}, year {year} is as expected."
- )
- def assert_nan_values(
- df: pd.DataFrame,
- test_case: tuple[str, ...],
- category_column: str = "category (IPCC1996_2006_GIN_Inv)",
- entity_column: str = "entity",
- ) -> None:
- """
- Check if values that are empty or NE or NE1 in the PDF tables
- are not present in the dataset.
- Input
- -----
- df
- The data frame to check.
- test_case
- The combination of input parameters.
- Use the format (<category>, <entity>, <year>).
- category_column
- The columns where to look for the category.
- entity_column
- The column where to look for the entity.
- """
- category = test_case[0]
- entity = test_case[1]
- year = test_case[2]
- if category not in df[category_column].unique():
- warning_string = f"{category} is not in the data set. Either all values for this category are NaN or the category never existed in the data set."
- warnings.warn(warning_string)
- return
- if entity not in df[entity_column].unique():
- warning_string = f"{entity} is not in the data set. Either all values for this entity are NaN or the category never existed in the data set."
- warnings.warn(warning_string)
- return
- arr = df.loc[
- (df[category_column] == category) & (df[entity_column] == entity), year
- ].values
- assert np.isnan(arr[0]), f"Value is {arr[0]} and not NaN."
- print(f"Value for category {category}, entity {entity}, year {year} is NaN.")
|