Source code for rsmtool.transformer

"""
Class for transforming features.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import logging
from typing import Callable, Dict, Optional

import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr



[docs]
class FeatureTransformer:
    """Encapsulate feature transformation methods."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """
        Initialize the FeatureTransformer object.

        Parameters
        ----------
        logger : Optional[logging.Logger]
            Logger object to use in the transformer. If not provided,
            a logger will be created with the name of this class.
        """
        self.logger = logger if logger else logging.getLogger(__name__)


[docs]
    def apply_sqrt_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "sqrt" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The sqrt transformation should not be applied to "
                    f"feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The sqrt transformation was applied to feature "
                    f"{name} which has negative values for some responses. "
                    f"No system score will be generated for such responses"
                )

        with np.errstate(invalid="ignore"):
            new_data = np.sqrt(values)
        return new_data



[docs]
    def apply_log_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "log" transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has zero or negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has
            zero or negative values and ``raise_error`` is ``True``.
        """
        # check if the feature has any zeros
        if np.any(values == 0):
            if raise_error:
                raise ValueError(
                    f"The log transformation should not be applied to "
                    f"feature {name} which can have a value of 0."
                )
            else:
                self.logger.warning(
                    f"The log transformation was applied to feature "
                    f"{name} which has a value of 0 for some responses. "
                    f"No system score will be generated for such responses."
                )

        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The log transformation should not be applied to "
                    f"feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The log transformation was applied to feature "
                    f"{name} which has negative values for some responses. "
                    f"No system score will be generated for such responses"
                )

        new_data = np.log(values)
        return new_data



[docs]
    def apply_inverse_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True, sd_multiplier: int = 4
    ) -> np.ndarray:
        """
        Apply the "inv" (inverse) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to a feature
            that has zero values or to a feature that has both positive and
            negative values.
            Defaults to ``True``.
        sd_multiplier : int
            Use this std. dev. multiplier to compute the ceiling and floor for
            outlier removal and check that these are not equal to zero.
            Defaults to 4.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that is
            zero or to a feature that can have different
            signs, and ``raise_error`` is ``True``.
        """
        if np.any(values == 0):
            if raise_error:
                raise ValueError(
                    f"The inverse transformation should not be applied "
                    f"to feature {name} which can have a value of 0."
                )
            else:
                self.logger.warning(
                    f"The inverse transformation was applied to feature "
                    f"{name} which has a value of 0 for some responses. "
                    f"No system score will be generated for such responses."
                )

        # check if the floor or ceiling are zero
        data_mean = np.mean(values)
        data_sd = np.std(values, ddof=1)
        floor = data_mean - sd_multiplier * data_sd
        ceiling = data_mean + sd_multiplier * data_sd
        if floor == 0 or ceiling == 0:
            self.logger.warning(
                f"The floor/ceiling for feature {name} is zero after "
                f"applying the inverse transformation."
            )

        # check if the feature can be both positive and negative
        all_positive = np.all(np.abs(values) == values)
        all_negative = np.all(np.abs(values) == -values)
        if not (all_positive or all_negative):
            if raise_error:
                raise ValueError(
                    f"The inverse transformation should not be applied "
                    f"to feature {name} where the values can have different signs"
                )
            else:
                self.logger.warning(
                    f"The inverse transformation was applied to feature "
                    f"{name} where the values can have different signs. "
                    f"This can change the ranking of the responses."
                )

        with np.errstate(divide="ignore"):
            new_data = 1 / values

        return new_data



[docs]
    def apply_add_one_inverse_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "addOneInv" (add one and invert) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has zero or negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that
            has negative values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The addOneInv transformation should not be applied "
                    f"to feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The addOneInv transformation was applied to "
                    f"feature {name} which has negative values for "
                    f"some responses. This can change the ranking of "
                    f"the responses."
                )

        new_data = 1 / (values + 1)
        return new_data



[docs]
    def apply_add_one_log_transform(
        self, name: str, values: np.ndarray, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply the "addOneLn" (add one and log) transform to ``values``.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy.ndarray
            Numpy array containing the feature values.
        raise_error : bool
            If ``True``, raises an error if the transform is applied to
            a feature that has zero or negative values.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array that contains the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that
            has negative values and ``raise_error`` is ``True``.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError(
                    f"The addOneLn transformation should not be applied "
                    f"to feature {name} which can have negative values."
                )
            else:
                self.logger.warning(
                    f"The log transformation was applied to feature "
                    f"{name} which has negative values for some responses. "
                    f"If the feature value remains negative after adding one, "
                    f"no score will be generated for such responses."
                )

        new_data = np.log(values + 1)
        return new_data



[docs]
    def transform_feature(
        self, values: np.ndarray, column_name: str, transform: str, raise_error: bool = True
    ) -> np.ndarray:
        """
        Apply given transform to all values in the given numpy array.

        The values are assumed to be for the feature with the given name.

        Parameters
        ----------
        values : numpy.ndarray
            Numpy array containing the feature values.
        column_name : str
            Name of the feature to transform.
        transform : str
            Name of the transform to apply. One of {``"inv"``, ``"sqrt"``,
            ``"log"``, ``"addOneInv"``, ``"addOneLn"``, ``"raw"``, ``"org"``}.
        raise_error : bool
            If ``True``, raise a ValueError if a transformation leads to
            invalid values or may change the ranking of the responses.
            Defaults to ``True``.

        Returns
        -------
        new_data : numpy.ndarray
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the given transform is not recognized.

        Note
        ----
        Many of these transformations may be meaningless for features which
        span both negative and positive values. Some transformations may
        throw errors for negative feature values.
        """
        transforms: Dict[str, Callable] = {
            "inv": self.apply_inverse_transform,
            "sqrt": self.apply_sqrt_transform,
            "log": self.apply_log_transform,
            "addOneInv": self.apply_add_one_inverse_transform,
            "addOneLn": self.apply_add_one_log_transform,
            "raw": lambda data: data,
            "org": lambda data: data,
        }

        # make sure we have a valid transform function
        if transform is None or transform not in transforms:
            raise ValueError(f"Unrecognized feature transformation: {transform}")

        transformer = transforms[transform]
        args = [column_name, values, raise_error] if transform not in ["raw", "org"] else [values]
        new_data = transformer(*args)
        return new_data



[docs]
    def find_feature_transform(
        self, feature_name: str, feature_value: pd.Series, scores: pd.Series
    ) -> str:
        """
        Identify best transformation for feature given correlation with score.

        The best transformation is chosen based on the absolute Pearson
        correlation with human score.

        Parameters
        ----------
        feature_name: str
            Name of feature for which to find the transformation.
        feature_value: pandas.Series
            Series containing feature values.
        scores: pandas.Series
            Numeric human scores.

        Returns
        -------
        best_transformation: str
            The name of the transformation which gives the highest correlation
            between the feature values and the human scores. See
            :ref:`documentation <select_transformations_rsmtool>` for the
            full list of transformations.
        """
        # Do not use sqrt and ln for potential negative features.
        # Do not use inv for positive features.
        if any(feature_value < 0):
            applicable_transformations = ["org", "inv"]
        else:
            applicable_transformations = ["org", "sqrt", "addOneInv", "addOneLn"]

        correlations = []
        for trans in applicable_transformations:
            try:
                transformed_value = self.transform_feature(feature_value, feature_name, trans)

                correlations.append(abs(pearsonr(transformed_value, scores)[0]))
            except ValueError:
                # If the transformation returns an error, append 0.
                correlations.append(0)
        best = np.argmax(correlations)
        best_transformation = applicable_transformations[best]
        return best_transformation