Source code for rsmtool.transformer

"""
Class for transforming features.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import logging
from typing import Callable, Dict, Optional

import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr


class FeatureTransformer:
    """Encapsulate feature transformation methods."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """
        Initialize the FeatureTransformer object.

        Parameters
        ----------
        logger : Optional[logging.Logger]
            Logger object to use in the transformer. If not provided,
            the logger named after this module (``__name__``) is used.
        """
        self.logger = logger if logger is not None else logging.getLogger(__name__)

    def apply_sqrt_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "sqrt" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values. If ``False``, a warning
            is logged instead and the transform is still applied.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The sqrt transformation should not be applied to "
                    f"feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The sqrt transformation was applied to feature "
                    f"{name} which has negative values for some responses. "
                    f"No system score will be generated for such responses"
                )

        # the problem has already been reported through the logger, so
        # silence numpy's own "invalid value" warning for negative inputs
        with np.errstate(invalid="ignore"):
            new_data = np.sqrt(values)
        return new_data

    def apply_log_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "log" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has zero or negative values. If ``False``, a
            warning is logged instead and the transform is still applied.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has zero or
            negative values and ``raise_error`` is ``True``.
        """
        # check if the feature has any zeros
        if np.any(values == 0):
            if raise_error:
                raise ValueError(
                    f"The log transformation should not be applied to "
                    f"feature {name} which can have a value of 0."
                )
            else:
                self.logger.warning(
                    f"The log transformation was applied to feature "
                    f"{name} which has a value of 0 for some responses. "
                    f"No system score will be generated for such responses."
                )

        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The log transformation should not be applied to "
                    f"feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The log transformation was applied to feature "
                    f"{name} which has negative values for some responses. "
                    f"No system score will be generated for such responses"
                )

        # zeros trigger numpy's "divide" warning and negatives its "invalid"
        # warning; both were already reported above, so silence them here
        # just as the sqrt and inverse transforms do
        with np.errstate(divide="ignore", invalid="ignore"):
            new_data = np.log(values)
        return new_data

    def apply_inverse_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True, sd_multiplier: int = 4
    ) -> np.ndarray:
        """
        Apply the "inv" (inverse) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has zero values or to a feature that has both
            positive and negative values. If ``False``, a warning is
            logged instead and the transform is still applied.
            Defaults to ``True``.
        sd_multiplier : int
            Use this std. dev. multiplier to compute the ceiling
            and floor for outlier removal and check that these
            are not equal to zero.
            Defaults to 4.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has zero values
            or to a feature whose values have different signs, and
            ``raise_error`` is ``True``.
        """
        if np.any(values == 0):
            if raise_error:
                raise ValueError(
                    f"The inverse transformation should not be applied "
                    f"to feature {name} which can have a value of 0."
                )
            else:
                self.logger.warning(
                    f"The inverse transformation was applied to feature "
                    f"{name} which has a value of 0 for some responses. "
                    f"No system score will be generated for such responses."
                )

        # check if the floor or ceiling are zero; this only ever warns,
        # regardless of ``raise_error``
        data_mean = np.mean(values)
        data_sd = np.std(values, ddof=1)
        floor = data_mean - sd_multiplier * data_sd
        ceiling = data_mean + sd_multiplier * data_sd
        if floor == 0 or ceiling == 0:
            self.logger.warning(
                f"The floor/ceiling for feature {name} is zero after "
                f"applying the inverse transformation."
            )

        # check if the feature can be both positive and negative
        all_positive = np.all(np.abs(values) == values)
        all_negative = np.all(np.abs(values) == -values)
        if not (all_positive or all_negative):
            if raise_error:
                raise ValueError(
                    f"The inverse transformation should not be applied "
                    f"to feature {name} where the values can have different signs"
                )
            else:
                self.logger.warning(
                    f"The inverse transformation was applied to feature "
                    f"{name} where the values can have different signs. "
                    f"This can change the ranking of the responses."
                )

        # zeros were already reported above; silence numpy's divide warning
        with np.errstate(divide="ignore"):
            new_data = 1 / values
        return new_data

    def apply_add_one_inverse_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "addOneInv" (add one and invert) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values. If ``False``, a warning
            is logged instead and the transform is still applied.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The addOneInv transformation should not be applied "
                    f"to feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The addOneInv transformation was applied to "
                    f"feature {name} which has negative values for "
                    f"some responses. This can change the ranking of "
                    f"the responses."
                )

        # a value of exactly -1 divides by zero; negatives were already
        # reported above, so silence numpy's divide warning here
        with np.errstate(divide="ignore"):
            new_data = 1 / (values + 1)
        return new_data

    def apply_add_one_log_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "addOneLn" (add one and log) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values. If ``False``, a warning
            is logged instead and the transform is still applied.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array that contains the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The addOneLn transformation should not be applied "
                    f"to feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The addOneLn transformation was applied to feature "
                    f"{name} which has negative values for some responses. "
                    f"If the feature value remains negative after adding one, "
                    f"no score will be generated for such responses."
                )

        # values <= -1 give log(0) or log(negative); negatives were already
        # reported above, so silence numpy's divide/invalid warnings here
        with np.errstate(divide="ignore", invalid="ignore"):
            new_data = np.log(values + 1)
        return new_data

    def transform_feature(
        self, values: np.ndarray, column_name: str, transform: str, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply given transform to all values in the given numpy array.

        The values are assumed to be for the feature with the given name.

        Parameters
        ----------
        values : numpy.ndarray
            Numpy array containing the feature values.
        column_name : str
            Name of the feature to transform.
        transform : str
            Name of the transform to apply. One of {``"inv"``, ``"sqrt"``,
            ``"log"``, ``"addOneInv"``, ``"addOneLn"``, ``"raw"``, ``"org"``}.
            ``"raw"`` and ``"org"`` return ``values`` unchanged.
        raise_error : bool
            If ``True``, raise a ValueError if a transformation leads to
            invalid values or may change the ranking of the responses.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the given transform is not recognized.

        Note
        ----
        Many of these transformations may be meaningless for features which
        span both negative and positive values. Some transformations may
        throw errors for negative feature values.
        """
        # "raw" and "org" are identity transforms: hand the input back as is
        if transform in ("raw", "org"):
            return values

        transforms: Dict[str, Callable] = {
            "inv": self.apply_inverse_transform,
            "sqrt": self.apply_sqrt_transform,
            "log": self.apply_log_transform,
            "addOneInv": self.apply_add_one_inverse_transform,
            "addOneLn": self.apply_add_one_log_transform,
        }

        # make sure we have a valid transform function; this also rejects
        # ``None`` since it is not a key of the dispatch table
        if transform not in transforms:
            raise ValueError(f"Unrecognized feature transformation: {transform}")

        return transforms[transform](column_name, values, raise_error)

    def find_feature_transform(
        self, feature_name: str, feature_value: pd.Series, scores: pd.Series
    ) -> str:
        """
        Identify best transformation for feature given correlation with score.

        The best transformation is chosen based on the absolute Pearson
        correlation with human score. Ties are resolved in favor of the
        transformation that is tried first ("org" is always tried first).

        Parameters
        ----------
        feature_name: str
            Name of feature for which to find the transformation.
        feature_value: pandas.Series
            Series containing feature values.
        scores: pandas.Series
            Numeric human scores.

        Returns
        -------
        best_transformation: str
            The name of the transformation which gives the highest correlation
            between the feature values and the human scores. See
            :ref:`documentation <select_transformations_rsmtool>` for the
            full list of transformations.
        """
        # Features with any negative value are only tried with "org" and "inv";
        # all other features are tried with "org", "sqrt" and the add-one
        # variants of the inverse and log transforms.
        if np.any(feature_value < 0):
            applicable_transformations = ["org", "inv"]
        else:
            applicable_transformations = ["org", "sqrt", "addOneInv", "addOneLn"]

        correlations = []
        for trans in applicable_transformations:
            try:
                transformed_value = self.transform_feature(feature_value, feature_name, trans)
                correlation = abs(pearsonr(transformed_value, scores)[0])
            except ValueError:
                # If the transformation returns an error, use 0.
                correlation = 0
            # ``np.argmax`` treats NaN as the largest value, so an undefined
            # correlation must not be allowed to win; score it like a failure.
            if np.isnan(correlation):
                correlation = 0
            correlations.append(correlation)

        best = np.argmax(correlations)
        best_transformation = applicable_transformations[best]
        return best_transformation