# Source code for rsmtool.transformer
"""
Class for transforming features.
:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""
import logging
from typing import Callable, Dict, Optional
import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr
# [docs]
class FeatureTransformer:
    """Encapsulate feature transformation methods."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """
        Initialize the FeatureTransformer object.

        Parameters
        ----------
        logger : Optional[logging.Logger]
            Logger object to use in the transformer. If not provided,
            the logger named after this module (``__name__``) is used.
        """
        self.logger = logger if logger is not None else logging.getLogger(__name__)

    def apply_sqrt_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "sqrt" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values. If ``False``, a warning
            is logged instead and the negative entries become NaN.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The sqrt transformation should not be applied to "
                    f"feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The sqrt transformation was applied to feature "
                    f"{name} which has negative values for some responses. "
                    f"No system score will be generated for such responses"
                )

        # the problem (if any) has already been reported above, so do
        # not let numpy emit a second, less informative warning
        with np.errstate(invalid="ignore"):
            new_data = np.sqrt(values)
        return new_data

    def apply_log_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "log" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has zero or negative values. If ``False``,
            a warning is logged instead; zeros become ``-inf`` and
            negative entries become NaN.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has
            zero or negative values and ``raise_error`` is ``True``.
        """
        # check if the feature has any zeros
        if np.any(values == 0):
            if raise_error:
                raise ValueError(
                    f"The log transformation should not be applied to "
                    f"feature {name} which can have a value of 0."
                )
            else:
                self.logger.warning(
                    f"The log transformation was applied to feature "
                    f"{name} which has a value of 0 for some responses. "
                    f"No system score will be generated for such responses."
                )

        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The log transformation should not be applied to "
                    f"feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The log transformation was applied to feature "
                    f"{name} which has negative values for some responses. "
                    f"No system score will be generated for such responses"
                )

        # zeros ("divide") and negatives ("invalid") were already reported
        # through the logger; silence numpy the same way the sqrt and
        # inverse transforms do
        with np.errstate(divide="ignore", invalid="ignore"):
            new_data = np.log(values)
        return new_data

    def apply_inverse_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True, sd_multiplier: int = 4
    ) -> np.ndarray:
        """
        Apply the "inv" (inverse) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to a feature
            that has zero values or to a feature that has both positive and
            negative values. If ``False``, warnings are logged instead.
            Defaults to ``True``.
        sd_multiplier : int
            Use this std. dev. multiplier to compute the ceiling and floor for
            outlier removal and check that these are not equal to zero.
            Defaults to 4.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that is
            zero or to a feature that can have different
            signs, and ``raise_error`` is ``True``.
        """
        if np.any(values == 0):
            if raise_error:
                raise ValueError(
                    f"The inverse transformation should not be applied "
                    f"to feature {name} which can have a value of 0."
                )
            else:
                self.logger.warning(
                    f"The inverse transformation was applied to feature "
                    f"{name} which has a value of 0 for some responses. "
                    f"No system score will be generated for such responses."
                )

        # check if the floor or ceiling are zero; this only ever warns,
        # regardless of ``raise_error``
        data_mean = np.mean(values)
        data_sd = np.std(values, ddof=1)
        floor = data_mean - sd_multiplier * data_sd
        ceiling = data_mean + sd_multiplier * data_sd
        if floor == 0 or ceiling == 0:
            self.logger.warning(
                f"The floor/ceiling for feature {name} is zero after "
                f"applying the inverse transformation."
            )

        # check if the feature can be both positive and negative
        # (zeros count as belonging to either sign)
        all_positive = np.all(np.abs(values) == values)
        all_negative = np.all(np.abs(values) == -values)
        if not (all_positive or all_negative):
            if raise_error:
                raise ValueError(
                    f"The inverse transformation should not be applied "
                    f"to feature {name} where the values can have different signs"
                )
            else:
                self.logger.warning(
                    f"The inverse transformation was applied to feature "
                    f"{name} where the values can have different signs. "
                    f"This can change the ranking of the responses."
                )

        # division by zero was already reported above
        with np.errstate(divide="ignore"):
            new_data = 1 / values
        return new_data

    def apply_add_one_inverse_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "addOneInv" (add one and invert) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values. If ``False``, a warning
            is logged instead.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that
            has negative values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The addOneInv transformation should not be applied "
                    f"to feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The addOneInv transformation was applied to "
                    f"feature {name} which has negative values for "
                    f"some responses. This can change the ranking of "
                    f"the responses."
                )

        # a value of exactly -1 (only reachable when ``raise_error`` is
        # ``False``) divides by zero; keep numpy quiet as in the other
        # transforms since the negative values were already reported
        with np.errstate(divide="ignore"):
            new_data = 1 / (values + 1)
        return new_data

    def apply_add_one_log_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "addOneLn" (add one and log) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values. If ``False``, a warning
            is logged instead.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array that contains the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that
            has negative values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The addOneLn transformation should not be applied "
                    f"to feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The addOneLn transformation was applied to feature "
                    f"{name} which has negative values for some responses. "
                    f"If the feature value remains negative after adding one, "
                    f"no score will be generated for such responses."
                )

        # values <= -1 (only reachable when ``raise_error`` is ``False``)
        # yield -inf/NaN; they were already reported through the logger
        with np.errstate(divide="ignore", invalid="ignore"):
            new_data = np.log(values + 1)
        return new_data

    def transform_feature(
        self, values: np.ndarray, column_name: str, transform: str, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply given transform to all values in the given numpy array.

        The values are assumed to be for the feature with the given name.

        Parameters
        ----------
        values : numpy.ndarray
            Numpy array containing the feature values.
        column_name : str
            Name of the feature to transform.
        transform : str
            Name of the transform to apply. One of {``"inv"``, ``"sqrt"``,
            ``"log"``, ``"addOneInv"``, ``"addOneLn"``, ``"raw"``, ``"org"``}.
            ``"raw"`` and ``"org"`` return ``values`` unchanged.
        raise_error : bool
            If ``True``, raise a ValueError if a transformation leads to
            invalid values or may change the ranking of the responses.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the given transform is not recognized.

        Note
        ----
        Many of these transformations may be meaningless for features which
        span both negative and positive values. Some transformations may
        throw errors for negative feature values.
        """
        transforms: Dict[str, Callable] = {
            "inv": self.apply_inverse_transform,
            "sqrt": self.apply_sqrt_transform,
            "log": self.apply_log_transform,
            "addOneInv": self.apply_add_one_inverse_transform,
            "addOneLn": self.apply_add_one_log_transform,
            "raw": lambda data: data,
            "org": lambda data: data,
        }

        # make sure we have a valid transform function
        # (``None`` is never a key, so it is rejected here as well)
        if transform not in transforms:
            raise ValueError(f"Unrecognized feature transformation: {transform}")

        transformer = transforms[transform]

        # the identity transforms take only the data; every real transform
        # also needs the feature name and the error-handling mode
        if transform in ("raw", "org"):
            return transformer(values)
        return transformer(column_name, values, raise_error)

    def find_feature_transform(
        self, feature_name: str, feature_value: pd.Series, scores: pd.Series
    ) -> str:
        """
        Identify best transformation for feature given correlation with score.

        The best transformation is chosen based on the absolute Pearson
        correlation with human score. Transformations that raise an error
        or yield an undefined (NaN) correlation are scored as 0. Ties are
        resolved in favor of the transformation listed first.

        Parameters
        ----------
        feature_name: str
            Name of feature for which to find the transformation.
        feature_value: pandas.Series
            Series containing feature values.
        scores: pandas.Series
            Numeric human scores.

        Returns
        -------
        best_transformation: str
            The name of the transformation which gives the highest correlation
            between the feature values and the human scores. See
            :ref:`documentation <select_transformations_rsmtool>` for the
            full list of transformations.
        """
        # Do not use sqrt and ln for potential negative features.
        # Do not use inv for positive features.
        if np.any(feature_value < 0):
            applicable_transformations = ["org", "inv"]
        else:
            applicable_transformations = ["org", "sqrt", "addOneInv", "addOneLn"]

        correlations = []
        for trans in applicable_transformations:
            try:
                transformed_value = self.transform_feature(feature_value, feature_name, trans)
                correlation = abs(pearsonr(transformed_value, scores)[0])
            except ValueError:
                # If the transformation returns an error, append 0.
                correlation = 0
            # ``np.argmax`` treats NaN as the maximum, so an undefined
            # correlation (e.g. the transformed feature is constant) must
            # not be allowed to win the comparison
            if np.isnan(correlation):
                correlation = 0
            correlations.append(correlation)

        best = np.argmax(correlations)
        best_transformation = applicable_transformations[best]
        return best_transformation