Source code for rsmtool.transformer

"""
Class for transforming features.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import logging

import numpy as np
from scipy.stats.stats import pearsonr


class FeatureTransformer:
    """Encapsulate feature transformation methods."""

    def __init__(self, logger=None):
        """
        Initialize the FeatureTransformer object.

        Parameters
        ----------
        logger : logging.Logger, optional
            Logger used to report warnings. If not given (or falsy), a
            logger named after this module is used.
        """
        self.logger = logger or logging.getLogger(__name__)

    def apply_sqrt_transform(self, name, values, raise_error=True):
        """
        Apply the "sqrt" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The sqrt transformation should not be applied to "
                    f"feature {name} which can have negative values."
                )
            self.logger.warning(
                f"The sqrt transformation was applied to feature "
                f"{name} which has negative values for some responses. "
                f"No system score will be generated for such responses"
            )

        # negative inputs become NaN; that was already reported above
        with np.errstate(invalid="ignore"):
            new_data = np.sqrt(values)
        return new_data

    def apply_log_transform(self, name, values, raise_error=True):
        """
        Apply the "log" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has zero or negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has zero or
            negative values and ``raise_error`` is ``True``.
        """
        # check if the feature has any zeros
        if np.any(values == 0):
            if raise_error:
                raise ValueError(
                    f"The log transformation should not be applied to "
                    f"feature {name} which can have a value of 0."
                )
            self.logger.warning(
                f"The log transformation was applied to feature "
                f"{name} which has a value of 0 for some responses. "
                f"No system score will be generated for such responses."
            )

        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The log transformation should not be applied to "
                    f"feature {name} which can have negative values."
                )
            self.logger.warning(
                f"The log transformation was applied to feature "
                f"{name} which has negative values for some responses. "
                f"No system score will be generated for such responses"
            )

        # zeros become -inf and negatives become NaN; both cases were
        # already reported above, so silence the numpy runtime warnings
        # the same way the sqrt and inverse transforms do
        with np.errstate(divide="ignore", invalid="ignore"):
            new_data = np.log(values)
        return new_data

    def apply_inverse_transform(self, name, values, raise_error=True, sd_multiplier=4):
        """
        Apply the "inv" (inverse) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has zero values or to a feature that has
            both positive and negative values.
            Defaults to ``True``.
        sd_multiplier : int, optional
            Use this std. dev. multiplier to compute the ceiling
            and floor for outlier removal and check that these
            are not equal to zero.
            Defaults to 4.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that is zero or to
            a feature that can have different signs, and ``raise_error``
            is ``True``.
        """
        if np.any(values == 0):
            if raise_error:
                raise ValueError(
                    f"The inverse transformation should not be applied "
                    f"to feature {name} which can have a value of 0."
                )
            self.logger.warning(
                f"The inverse transformation was applied to feature "
                f"{name} which has a value of 0 for some responses. "
                f"No system score will be generated for such responses."
            )

        # check if the floor or ceiling are zero
        data_mean = np.mean(values)
        data_sd = np.std(values, ddof=1)
        floor = data_mean - sd_multiplier * data_sd
        ceiling = data_mean + sd_multiplier * data_sd
        if floor == 0 or ceiling == 0:
            self.logger.warning(
                f"The floor/ceiling for feature {name} is zero after "
                f"applying the inverse transformation."
            )

        # check if the feature can be both positive and negative;
        # NaN values fail both comparisons and count as mixed signs
        all_positive = np.all(values >= 0)
        all_negative = np.all(values <= 0)
        if not (all_positive or all_negative):
            if raise_error:
                raise ValueError(
                    f"The inverse transformation should not be applied "
                    f"to feature {name} where the values can have different signs"
                )
            self.logger.warning(
                f"The inverse transformation was applied to feature "
                f"{name} where the values can have different signs. "
                f"This can change the ranking of the responses."
            )

        # zeros become +/-inf; that was already reported above
        with np.errstate(divide="ignore"):
            new_data = 1 / values

        return new_data

    def apply_add_one_inverse_transform(self, name, values, raise_error=True):
        """
        Apply the "addOneInv" (add one and invert) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The addOneInv transformation should not be applied "
                    f"to feature {name} which can have negative values."
                )
            self.logger.warning(
                f"The addOneInv transformation was applied to "
                f"feature {name} which has negative values for "
                f"some responses. This can change the ranking of "
                f"the responses."
            )

        new_data = 1 / (values + 1)
        return new_data

    def apply_add_one_log_transform(self, name, values, raise_error=True):
        """
        Apply the "addOneLn" (add one and log) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : np.array
            Numpy array that contains the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The addOneLn transformation should not be applied "
                    f"to feature {name} which can have negative values."
                )
            # name the transform that was actually applied ("addOneLn"),
            # matching the error message above
            self.logger.warning(
                f"The addOneLn transformation was applied to feature "
                f"{name} which has negative values for some responses. "
                f"If the feature value remains negative after adding one, "
                f"no score will be generated for such responses."
            )

        # values <= -1 yield -inf/NaN; that possibility was already
        # reported above, so silence the numpy runtime warnings
        with np.errstate(divide="ignore", invalid="ignore"):
            new_data = np.log(values + 1)
        return new_data

    def transform_feature(self, values, column_name, transform, raise_error=True):
        """
        Apply given transform to all values in the given numpy array.

        The values are assumed to be for the feature with the given name.

        Parameters
        ----------
        values : numpy array
            Numpy array containing the feature values.
        column_name : str
            Name of the feature to transform.
        transform : str
            Name of the transform to apply. One of {"inv", "sqrt", "log",
            "addOneInv", "addOneLn", "raw", "org"}.
        raise_error : bool, optional
            If ``True``, raise a ValueError if a transformation leads to
            invalid values or may change the ranking of the responses.
            Defaults to ``True``.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the given transform is not recognized.

        Note
        ----
        Many of these transformations may be meaningless for features which
        span both negative and positive values. Some transformations may
        throw errors for negative feature values.
        """
        transforms = {
            "inv": self.apply_inverse_transform,
            "sqrt": self.apply_sqrt_transform,
            "log": self.apply_log_transform,
            "addOneInv": self.apply_add_one_inverse_transform,
            "addOneLn": self.apply_add_one_log_transform,
            # "raw" and "org" both leave the values untouched
            "raw": lambda column_name, data, raise_error: data,
            "org": lambda column_name, data, raise_error: data,
        }

        # make sure we have a valid transform function
        # (``None`` is never a key, so it is rejected here as well)
        if transform not in transforms:
            raise ValueError(f"Unrecognized feature transformation: {transform}")

        transformer = transforms[transform]
        new_data = transformer(column_name, values, raise_error)
        return new_data

    def find_feature_transform(self, feature_name, feature_value, scores):
        """
        Identify best transformation for feature given correlation with score.

        The best transformation is chosen based on the absolute Pearson
        correlation with human score. A transformation whose correlation
        cannot be computed (the transform raises ``ValueError`` or the
        correlation is NaN, e.g. because the transformed feature is
        constant) is scored as 0 so that it is never preferred over a
        transformation with a well-defined correlation.

        Parameters
        ----------
        feature_name: str
            Name of feature for which to find the transformation.
        feature_value: pandas Series
            Series containing feature values.
        scores: pandas Series
            Numeric human scores.

        Returns
        -------
        best_transformation: str
            The name of the transformation which gives the highest correlation
            between the feature values and the human scores. See
            :ref:`documentation <select_transformations_rsmtool>` for the
            full list of transformations.
        """
        # Do not use sqrt and ln for potential negative features.
        # Do not use inv for positive features.
        if np.any(feature_value < 0):
            applicable_transformations = ["org", "inv"]
        else:
            applicable_transformations = ["org", "sqrt", "addOneInv", "addOneLn"]

        correlations = []
        for trans in applicable_transformations:
            try:
                transformed_value = self.transform_feature(feature_value, feature_name, trans)
                correlation = abs(pearsonr(transformed_value, scores)[0])
            except ValueError:
                # If the transformation returns an error, append 0.
                correlation = 0
            # np.argmax returns the position of the first NaN, so an
            # undefined correlation would otherwise be picked as the best
            if np.isnan(correlation):
                correlation = 0
            correlations.append(correlation)

        best = np.argmax(correlations)
        best_transformation = applicable_transformations[best]
        return best_transformation