Source code for rsmtool.transformer

"""
Class for transforming features.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import logging

import numpy as np
from scipy.stats.stats import pearsonr


class FeatureTransformer:
    """
    Encapsulate feature transformation methods.

    Every ``apply_*`` method validates the feature values first. Depending
    on ``raise_error``, a validation failure either raises ``ValueError``
    or is logged as a warning, after which the transform is applied anyway.
    """

    def __init__(self, logger=None):
        """
        Initialize the transformer.

        Parameters
        ----------
        logger : logging.Logger, optional
            Logger used for warnings. If not given (or falsy), a logger
            named after this module is used.
        """
        self.logger = logger or logging.getLogger(__name__)

    def _raise_or_warn(self, raise_error, error_message, warning_message):
        """Raise ``ValueError`` if ``raise_error`` is set, else log a warning."""
        if raise_error:
            raise ValueError(error_message)
        self.logger.warning(warning_message)

    @staticmethod
    def _identity(name, values, raise_error):
        """Return ``values`` untouched (used for "raw" and "org")."""
        return values

    def apply_sqrt_transform(self, name, values, raise_error=True):
        """
        Apply the "sqrt" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            self._raise_or_warn(
                raise_error,
                "The sqrt transformation should not be "
                "applied to feature {} which can have "
                "negative values".format(name),
                "The sqrt transformation was "
                "applied to feature {} which has "
                "negative values for some responses. "
                "No system score will be generated "
                "for such responses".format(name))

        # negative inputs become NaN; our own warning was already logged,
        # so silence numpy's duplicate "invalid value" warning
        with np.errstate(invalid='ignore'):
            new_data = np.sqrt(values)
        return new_data

    def apply_log_transform(self, name, values, raise_error=True):
        """
        Apply the "log" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has zero or negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has zero or
            negative values and ``raise_error`` is ``True``.
        """
        # check if the feature has any zeros
        if np.any(values == 0):
            self._raise_or_warn(
                raise_error,
                "The log transformation should not be "
                "applied to feature {} which can have a "
                "value of 0".format(name),
                "The log transformation was "
                "applied to feature {} which has a "
                "value of 0 for some responses. No system "
                "score will "
                "be generated for such responses".format(name))

        # check if the feature has any negative values
        if np.any(values < 0):
            self._raise_or_warn(
                raise_error,
                "The log transformation should not be "
                "applied to feature {} which can have "
                "negative values".format(name),
                "The log transformation was "
                "applied to feature {} which has "
                "negative values for some responses. No system "
                "score will "
                "be generated for such responses".format(name))

        # zeros become -inf and negatives become NaN; both cases were
        # already reported above, so silence numpy's duplicate warnings
        # (consistent with the sqrt and inverse transforms)
        with np.errstate(divide='ignore', invalid='ignore'):
            new_data = np.log(values)
        return new_data

    def apply_inverse_transform(self,
                                name,
                                values,
                                raise_error=True,
                                sd_multiplier=4):
        """
        Apply the "inv" (inverse) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has zero values or to a feature that has
            both positive and negative values.
            Defaults to ``True``.
        sd_multiplier : int, optional
            Use this std. dev. multiplier to compute the ceiling
            and floor for outlier removal and check that these
            are not equal to zero.
            Defaults to 4.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has zero
            values or to a feature whose values have different signs,
            and ``raise_error`` is ``True``.
        """
        if np.any(values == 0):
            self._raise_or_warn(
                raise_error,
                "The inverse transformation should not be "
                "applied to feature {} which can have a "
                "value of 0".format(name),
                "The inverse transformation was applied to "
                "feature {} which has a value of 0 for "
                "some responses. No system score will be "
                "generated for such responses".format(name))

        # check if the floor or ceiling are zero; this is only ever
        # reported as a warning, regardless of ``raise_error``
        data_mean = np.mean(values)
        data_sd = np.std(values, ddof=1)
        floor = data_mean - sd_multiplier * data_sd
        ceiling = data_mean + sd_multiplier * data_sd
        if floor == 0 or ceiling == 0:
            self.logger.warning("The floor/ceiling for feature {} "
                                "is zero after applying the inverse "
                                "transformation".format(name))

        # check if the feature can be both positive and negative
        no_negatives = np.all(values >= 0)
        no_positives = np.all(values <= 0)
        if not (no_negatives or no_positives):
            self._raise_or_warn(
                raise_error,
                "The inverse transformation should not be "
                "applied to feature {} where the values can "
                "have different signs".format(name),
                "The inverse transformation was "
                "applied to feature {} where the values can "
                "have different signs. This can change "
                "the ranking of the responses".format(name))

        # zeros become +/-inf; already reported above
        with np.errstate(divide='ignore'):
            new_data = 1 / values

        return new_data

    def apply_add_one_inverse_transform(self, name, values, raise_error=True):
        """
        Apply the "addOneInv" (add one and invert) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            self._raise_or_warn(
                raise_error,
                "The addOneInv transformation should not "
                "be applied to feature {} which can have "
                "negative values".format(name),
                "The addOneInv transformation was "
                "applied to feature {} which has "
                "negative values for some responses. "
                "This can change the ranking of the "
                "responses".format(name))

        # a value of exactly -1 divides by zero; negatives were already
        # reported above, so silence numpy's duplicate warning
        with np.errstate(divide='ignore'):
            new_data = 1 / (values + 1)
        return new_data

    def apply_add_one_log_transform(self, name, values, raise_error=True):
        """
        Apply the "addOneLn" (add one and log) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : np.array
            Numpy array that contains the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            self._raise_or_warn(
                raise_error,
                "The addOneLn transformation should not "
                "be applied to feature {} which can have "
                "negative values".format(name),
                "The addOneLn transformation was "
                "applied to feature {} which has "
                "negative values for some responses. "
                "If the feature value remains negative "
                "after adding one, no score will "
                "be generated for such responses".format(name))

        # values <= -1 yield -inf/NaN; negatives were already reported
        # above, so silence numpy's duplicate warnings
        with np.errstate(divide='ignore', invalid='ignore'):
            new_data = np.log(values + 1)
        return new_data

    def transform_feature(self,
                          values,
                          column_name,
                          transform,
                          raise_error=True):
        """
        Apply given transform to all values in the given numpy array.

        The values are assumed to be for the feature with the given name.

        Parameters
        ----------
        values : numpy array
            Numpy array containing the feature values.
        column_name : str
            Name of the feature to transform.
        transform : str
            Name of the transform to apply.
            One of {"inv", "sqrt", "log", "addOneInv", "addOneLn",
            "raw", "org"}. "raw" and "org" return ``values`` unchanged.
        raise_error : bool, optional
            If ``True``, raise a ValueError if a transformation leads to
            invalid values or may change the ranking of the responses.
            Defaults to ``True``.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the given transform is not recognized, or if the chosen
            transform rejects the values and ``raise_error`` is ``True``.

        Note
        ----
        Many of these transformations may be meaningless for features
        which span both negative and positive values. Some
        transformations may throw errors for negative feature values.
        """
        transforms = {'inv': self.apply_inverse_transform,
                      'sqrt': self.apply_sqrt_transform,
                      'log': self.apply_log_transform,
                      'addOneInv': self.apply_add_one_inverse_transform,
                      'addOneLn': self.apply_add_one_log_transform,
                      'raw': self._identity,
                      'org': self._identity}

        # make sure we have a valid transform function
        # (``None`` is never a key, so it is rejected here as well)
        if transform not in transforms:
            raise ValueError('Unrecognized feature transformation: '
                             ' {}'.format(transform))

        transformer = transforms[transform]
        return transformer(column_name, values, raise_error)

    def find_feature_transform(self, feature_name, feature_value, scores):
        """
        Identify best transformation for feature given correlation with score.

        The best transformation is chosen based on the absolute Pearson
        correlation with human score. A transformation that raises
        ``ValueError`` or yields an undefined (NaN) correlation is scored
        as 0. Ties are resolved in favor of the transformation listed
        first, so "org" wins when nothing does better.

        Parameters
        ----------
        feature_name: str
            Name of feature for which to find the transformation.
        feature_value: pandas Series
            Series containing feature values.
        scores: pandas Series
            Numeric human scores.

        Returns
        -------
        best_transformation: str
            The name of the transformation which gives the highest
            correlation between the feature values and the human scores.
            See :ref:`documentation <select_transformations_rsmtool>`
            for the full list of transformations.
        """
        # Do not use sqrt and ln for potential negative features.
        # Do not use inv for positive features.
        if np.any(feature_value < 0):
            applicable_transformations = ['org', 'inv']
        else:
            applicable_transformations = ['org', 'sqrt',
                                          'addOneInv', 'addOneLn']

        correlations = []
        for trans in applicable_transformations:
            try:
                transformed_value = self.transform_feature(feature_value,
                                                           feature_name,
                                                           trans)
                correlation = abs(pearsonr(transformed_value, scores)[0])
            except ValueError:
                # If the transformation returns an error, use 0.
                correlation = 0

            # np.argmax treats NaN as the maximum, so an undefined
            # correlation must not be allowed to beat a valid one
            if np.isnan(correlation):
                correlation = 0
            correlations.append(correlation)

        best = int(np.argmax(correlations))
        return applicable_transformations[best]