"""
Classes for preprocessing input data in various contexts.
:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""
import logging
import re
import warnings
from sys import version_info
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import pandas as pd
from numpy.random import RandomState
from rsmtool.configuration_parser import Configuration
from .container import DataContainer, DatasetDict
from .reader import DataReader
from .reporter import Reporter
from .transformer import FeatureTransformer
from .utils.conversion import convert_to_float
from .utils.models import is_built_in_model, is_skll_model
[docs]
class FeatureSubsetProcessor:
    """Class to encapsulate feature sub-setting methods."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """
        Initialize the FeatureSubsetProcessor object.

        Parameters
        ----------
        logger : Optional[logging.Logger]
            Logger used for warnings. If none is given, the module-level
            logger is used.
            Defaults to ``None``.
        """
        self.logger = logger if logger else logging.getLogger(__name__)

    def select_by_subset(
        self, feature_columns: List[str], feature_subset_specs: pd.DataFrame, subset: str
    ) -> List[str]:
        """
        Select feature columns using feature subset specifications.

        Parameters
        ----------
        feature_columns : List[str]
            A list of feature columns
        feature_subset_specs : pandas.DataFrame
            The feature subset specification data frame. It is read via
            its "Feature" column and the column named by ``subset``.
        subset : str
            The column to subset.

        Returns
        -------
        feature_names : List[str]
            A list of feature names to include, in the order in which they
            appear in ``feature_columns``.
        """
        # features flagged with 1 in the requested subset column
        feature_subset = feature_subset_specs[feature_subset_specs[subset] == 1]["Feature"]
        feature_names = [feature for feature in feature_columns if feature in feature_subset.values]

        # warn about data columns for which the specs have no row at all
        if len(feature_columns) != len(feature_names):
            feature_subset_specs_set = set(feature_subset_specs["Feature"])
            extra_columns = set(feature_columns).difference(feature_subset_specs_set)
            if extra_columns:
                # sort so that the message is deterministic across runs
                self.logger.warning(
                    f"No subset information was available for the following "
                    f"columns in the input file. These columns will not be "
                    f"used in the model: {', '.join(sorted(extra_columns))}"
                )

        # warn about subset features that are absent from the data
        if len(feature_subset) != len(feature_names):
            extra_subset_features = set(feature_subset).difference(set(feature_names))
            if extra_subset_features:
                self.logger.warning(
                    f"The following features were included into the {subset} "
                    f"subset in the feature_subset_file but were not specified "
                    f"in the input data: {', '.join(sorted(extra_subset_features))}"
                )
        return feature_names

    def check_feature_subset_file(
        self, df: pd.DataFrame, subset: Optional[str] = None, sign: Optional[str] = None
    ) -> None:
        """
        Check that feature subset file is complete and in the correct format.

        Raises an exception if it finds any errors but otherwise returns nothing.

        Parameters
        ----------
        df : pandas.DataFrame
            The data frame containing the feature subset file.
        subset : Optional[str]
            Name of a pre-defined feature subset.
            Defaults to ``None``.
        sign : Optional[str]
            Value of the sign.
            Defaults to ``None``.

        Raises
        ------
        ValueError
            If any columns are missing from the subset file.
        ValueError
            If any of the columns contain invalid values.
        """
        # the frame is only inspected, never modified, so no copy is needed
        df_feature_specs = df

        # we want to allow title-cased names of columns for historical reasons
        # e.g., `Feature` instead of `feature` etc.
        if "feature" not in df_feature_specs and "Feature" not in df_feature_specs:
            raise ValueError(
                "The feature_subset_file must contain "
                "a column named 'feature' "
                "containing the feature names."
            )

        if subset:
            if subset not in df_feature_specs:
                raise ValueError(f"Unknown value for feature_subset: {subset}")
            if not df_feature_specs[subset].isin([0, 1]).all():
                raise ValueError("The subset columns in feature file can only contain 0 or 1")

        if sign:
            possible_sign_columns = [f"sign_{sign}", f"Sign_{sign}"]
            existing_sign_columns = [c for c in possible_sign_columns if c in df_feature_specs]
            if len(existing_sign_columns) > 1:
                raise ValueError(
                    f"The feature_subset_file contains multiple columns "
                    f"for sign: {', '.join(existing_sign_columns)}"
                )
            if not existing_sign_columns:
                raise ValueError(
                    f"The feature_subset_file must contain the requested "
                    f"sign column 'sign_{sign}'"
                )
            sign_column = existing_sign_columns[0]
            if not df_feature_specs[sign_column].isin(["-", "+"]).all():
                raise ValueError("The sign columns in feature file can only contain - or +")
[docs]
class FeatureSpecsProcessor:
    """Encapsulate feature file processing methods."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """
        Initialize the FeatureSpecsProcessor object.

        Parameters
        ----------
        logger : Optional[logging.Logger]
            Logger used for warnings. If none is given, the module-level
            logger is used.
            Defaults to ``None``.
        """
        self.logger = logger if logger else logging.getLogger(__name__)

    def generate_default_specs(self, feature_names: List[str]) -> pd.DataFrame:
        """
        Generate default feature "specifications" for given feature names.

        The specifications are stored as a data frame with three columns
        "feature", "transform", and "sign".

        Parameters
        ----------
        feature_names: List[str]
            List of feature names for which to generate specifications.

        Returns
        -------
        feature_specs: pandas.DataFrame
            A dataframe with feature specifications that can be saved as a
            :ref:`feature list file <example_feature_csv>`.

        Note
        ----
        Since these are default specifications, the values for the
        "transform" column for each feature will be "raw" and the
        value for the "sign" column will be ``1``.
        """
        df_feature_specs = pd.DataFrame({"feature": feature_names})
        df_feature_specs["transform"] = "raw"
        df_feature_specs["sign"] = 1.0
        return df_feature_specs

    def find_feature_sign(self, feature: str, sign_dict: Dict[str, str]) -> float:
        """
        Get the feature sign from the feature CSV file.

        Parameters
        ----------
        feature : str
            The name of the feature.
        sign_dict : dict
            A dictionary of feature signs.

        Returns
        -------
        feature_sign_numeric : float
            ``-1.0`` if the sign recorded for the feature is "-" and
            ``1.0`` otherwise, including when the feature has no entry.
        """
        if feature not in sign_dict:
            self.logger.warning(
                f"No information about sign is available for feature "
                f"{feature}. The feature will be assigned the default "
                f"positive weight."
            )
            return 1.0
        return -1.0 if sign_dict[feature] == "-" else 1.0

    def validate_feature_specs(
        self, df: pd.DataFrame, use_truncations: bool = False
    ) -> pd.DataFrame:
        """
        Validate given feature specifications.

        Check given feature specifications to make sure that there are no duplicate
        feature names and that all columns are in the right format. Add the
        default values for "transform" and "sign" if none are given.

        Parameters
        ----------
        df : pandas.DataFrame
            The feature specification DataFrame to validate.
        use_truncations : bool
            Whether to use truncation values. If this is ``True`` and truncation
            values are not specified, an exception is raised.
            Defaults to ``False``.

        Returns
        -------
        df_specs_new : pandas.DataFrame
            The output data frame with normalized values. It contains the
            columns "feature", "sign" and "transform", followed by "min"
            and "max" when ``use_truncations`` is ``True``.

        Raises
        ------
        KeyError
            If the input data frame does not have a "feature" column.
        ValueError
            If there are duplicate values in the "feature" column.
        ValueError
            if the "sign" column contains invalid values.
        ValueError
            If ``use_truncations`` is set to ``True``, and no
            "min" and "max" columns exist in the data set.
        """
        # work on a copy so that the caller's frame is never modified
        df_specs_new = df.copy()
        expected_columns = ["feature", "sign", "transform"]

        # we allow internally the use of 'Feature' since
        # this is the column name in subset_feature_file.
        if "Feature" in df:
            df_specs_new["feature"] = df["Feature"]

        # check that we have a column named `feature`
        if "feature" not in df_specs_new:
            raise KeyError("The feature file must contain a column named 'feature'")

        # check to make sure that there are no duplicate feature names
        feature_name_count = df_specs_new["feature"].value_counts()
        duplicate_features = feature_name_count[feature_name_count > 1]
        if len(duplicate_features) > 0:
            raise ValueError(
                f"The following feature names are duplicated in "
                f"the feature file: {duplicate_features.index}"
            )

        # if we have `sign` column, check that it can be converted to float
        # and that it only holds -1 or 1; this is an explicit check rather
        # than an `assert` so that it still runs under `python -O`
        if "sign" in df_specs_new:
            invalid_sign_message = (
                "The `sign` column in the feature file can only contain '1' or '-1'"
            )
            try:
                df_specs_new["sign"] = df_specs_new["sign"].astype(float)
            except ValueError as error:
                raise ValueError(invalid_sign_message) from error
            if not df_specs_new["sign"].isin([-1, 1]).all():
                raise ValueError(invalid_sign_message)
        else:
            df_specs_new["sign"] = 1

        if "transform" not in df_specs_new:
            df_specs_new["transform"] = "raw"

        if use_truncations:
            if not all(col in df_specs_new for col in ["min", "max"]):
                raise ValueError(
                    "The ``use_truncation_thresholds`` configuration option "
                    "was specified, but no ``min`` or ``max`` columns exist "
                    "in the feature file."
                )
            # add ``min`` and ``max`` to the
            # list of expected columns
            expected_columns.extend(["min", "max"])

        # keep only the expected columns, in a fixed order
        df_specs_new = df_specs_new[expected_columns]
        return df_specs_new

    def generate_specs(
        self,
        df: pd.DataFrame,
        feature_names: List[str],
        train_label: str,
        feature_subset: Optional[pd.DataFrame] = None,
        feature_sign: Optional[int] = None,
    ) -> pd.DataFrame:
        """
        Generate feature specifications using the feature CSV file.

        Compute the specifications for "sign" and the correlation with score
        to identify the best transformation.

        Parameters
        ----------
        df : pandas.DataFrame
            The input data frame from which to generate the specifications.
        feature_names : List[str]
            A list of feature names.
        train_label : str
            The label column for the training data
        feature_subset : Optional[pandas.DataFrame]
            A data frame containing the feature subset specifications, if any.
            When used, it is read via its "Feature" column and the column
            named ``Sign_<feature_sign>``.
            Defaults to ``None``.
        feature_sign : Optional[int]
            The sign of the feature.
            Defaults to ``None``.

        Returns
        -------
        df_feature_specs : pandas.DataFrame
            The output data frame containing the feature specifications,
            with one row per feature and the keys "feature", "transform"
            and "sign".
        """
        # get feature sign information, if available, as {feature: sign};
        # otherwise fall back to an empty dictionary
        if feature_sign and feature_subset is not None:
            sign_dict = dict(zip(feature_subset.Feature, feature_subset[f"Sign_{feature_sign}"]))
        else:
            sign_dict = {}

        feature_specs: List[Dict[str, Any]] = []
        for feature in feature_names:
            transform = FeatureTransformer().find_feature_transform(
                feature, df[feature], df[train_label]
            )
            sign = self.find_feature_sign(feature, sign_dict)
            # Change the sign for inverse and addOneInv transformations
            if transform in ["inv", "addOneInv"]:
                sign = sign * -1
            feature_specs.append({"feature": feature, "transform": transform, "sign": sign})

        return pd.DataFrame(feature_specs)
[docs]
class FeaturePreprocessor:
"""Class to preprocess features in training and testing sets."""
def __init__(self, logger: Optional[logging.Logger] = None):
"""Initialize the FeaturePreprocessor object."""
self.logger = logger if logger else logging.getLogger(__name__)
[docs]
def check_model_name(self, model_name: str) -> str:
    """
    Check that the given model name is valid and determine its type.

    Built-in models are checked first, then SKLL models.

    Parameters
    ----------
    model_name : str
        Name of the model.

    Returns
    -------
    model_type: str
        One of "BUILTIN" or "SKLL".

    Raises
    ------
    ValueError
        If the model is not supported.
    """
    if is_built_in_model(model_name):
        return "BUILTIN"
    if is_skll_model(model_name):
        return "SKLL"
    raise ValueError(
        f"The specified model {model_name} was not found. Please check the spelling."
    )
[docs]
def trim(
    self,
    values: Union[List[float], np.ndarray],
    trim_min: float,
    trim_max: float,
    tolerance: float = 0.4998,
) -> np.ndarray:
    """
    Trim values in given numpy array.

    The trimming uses ``trim_min`` - ``tolerance`` as the floor and
    ``trim_max`` + ``tolerance`` as the ceiling.

    Parameters
    ----------
    values : Union[List[float], numpy.ndarray]
        The values to trim. Lists are converted to numpy arrays, and
        integer or boolean data is promoted to ``float64`` so that the
        fractional bounds are not truncated when they are written.
    trim_min : float
        The lowest score on the score point, used for
        trimming the raw regression predictions.
    trim_max : float
        The highest score on the score point, used for
        trimming the raw regression predictions.
    tolerance : float
        The tolerance that will be used to compute the
        trim interval.
        Defaults to ``0.4998``.

    Returns
    -------
    trimmed_values : numpy.ndarray
        Numpy array containing the trimmed values. The input is never
        modified; a non-list input keeps its container type.

    Raises
    ------
    ValueError
        If ``trim_min``, ``trim_max``, or ``tolerance`` are ``None``.
    """
    # validate the arguments before touching the data
    if trim_min is None or trim_max is None or tolerance is None:
        raise ValueError("trim_min, trim_max, and tolerance must be numeric values.")

    if isinstance(values, list):
        values = np.array(values)

    new_max = trim_max + tolerance
    new_min = trim_min - tolerance

    # Assigning a float into an integer array silently truncates it (e.g.,
    # 4.4998 -> 4), so integer/boolean data is converted to float first.
    # ``astype`` returns a new object, so the input is still left untouched.
    dtype = getattr(values, "dtype", None)
    if isinstance(dtype, np.dtype) and dtype.kind in "biu":
        trimmed_values = values.astype(np.float64)
    else:
        trimmed_values = values.copy()

    trimmed_values[trimmed_values > new_max] = new_max
    trimmed_values[trimmed_values < new_min] = new_min
    return trimmed_values
[docs]
@staticmethod
def remove_outliers(
values: np.ndarray,
mean: Optional[float] = None,
sd: Optional[float] = None,
sd_multiplier: int = 4,
) -> np.ndarray:
r"""
Remove outliers from given array of values by clamping them.
Clamp any given values that are
± ``sd_multiplier`` (:math:`m`) standard deviations (:math:`\\sigma`)
away from the mean (:math:`\\mu`). Use given ``mean`` and ``sd``
instead of computing :math:`\\sigma` and :math:`\\mu`, if specified.
The values are clamped to the interval:
.. math::
[\\mu - m * \\sigma, \\mu + m * \\sigma]
Parameters
----------
values : numpy.ndarray
The values from which to remove outliers, usually corresponding to
a given feature.
mean : Optional[float]
Use the given mean value when computing outliers instead of the mean
from the data.
Defaults to ``None``.
sd : Optional[float]
Use the given std. dev. value when computing outliers instead of the
std. dev. from the data.
Defaults to ``None``.
sd_multiplier : int
Use the given multipler for the std. dev. when computing the outliers.
Defaults to 4.
Returns
-------
new_values : numpy.ndarray
Numpy array with the outliers clamped.
"""
# convert data to a numpy float array before doing any clamping
new_values = np.array(values, dtype=np.float64)
mean = mean if mean else new_values.mean()
sd = sd if sd else new_values.std()
floor = mean - sd_multiplier * sd
ceiling = mean + sd_multiplier * sd
new_values[new_values > ceiling] = ceiling
new_values[new_values < floor] = floor
return new_values
[docs]
def remove_outliers_using_truncations(
    self, values: np.ndarray, feature_name: str, truncations: pd.DataFrame
) -> np.ndarray:
    """
    Clamp feature values to pre-specified truncation bounds.

    Unlike ``remove_outliers()``, the bounds are not derived from a mean
    and standard deviation; they are looked up in the "min" and "max"
    columns of the given data frame.

    Parameters
    ----------
    values : numpy.ndarray
        The values from which to remove outliers, usually corresponding
        to a given feature.
    feature_name : str
        Name of the feature whose outliers are being clamped.
        It is used as the row label for the lookup.
    truncations : pandas.DataFrame
        A data frame with truncation values. The features should be set as
        the index.

    Returns
    -------
    new_values : numpy.ndarray
        A float64 numpy array with the outliers clamped.
    """
    # look up the bounds for this feature
    lower_bound = truncations.loc[feature_name, "min"]
    upper_bound = truncations.loc[feature_name, "max"]

    # clamp a float copy of the data: ceiling first, then floor
    clamped = np.array(values, dtype=np.float64)
    above = clamped > upper_bound
    clamped[above] = upper_bound
    below = clamped < lower_bound
    clamped[below] = lower_bound
    return clamped
[docs]
def select_candidates(
    self, df: pd.DataFrame, N: int, candidate_col: str = "candidate"
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split responses by whether their candidate has at least N responses.

    Parameters
    ----------
    df : pandas.DataFrame
        The data frame from which to select candidates with N or more items.
    N: int
        Minimal number of items per candidate
    candidate_col : str
        Name of the column which contains candidate ids.
        Defaults to "candidate".

    Returns
    -------
    df_included: pandas.DataFrame
        Data frame with responses from candidates with responses to N
        or more items.
    df_excluded: pandas.DataFrame
        Data frame with responses from candidates with responses to
        less than N items.

    Both data frames are new objects with their index reset.
    """
    # count the number of rows per candidate id
    response_counts = df[candidate_col].value_counts()
    eligible_ids = response_counts[response_counts >= N].index

    # one boolean mask partitions the rows into the two outputs
    is_eligible = df[candidate_col].isin(eligible_ids)
    df_included = df[is_eligible].reset_index(drop=True)
    df_excluded = df[~is_eligible].reset_index(drop=True)
    return (df_included, df_excluded)
[docs]
def check_subgroups(self, df: pd.DataFrame, subgroups: List[str]) -> pd.DataFrame:
    """
    Validate subgroup names in the given data.

    Check that every requested subgroup is a column of the data frame,
    then replace values in those columns that are empty or contain only
    whitespace with 'No info' for later convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data frame with subgroups to check. Its subgroup columns
        are overwritten in place.
    subgroups : List[str]
        List of column names that contain grouping information.

    Returns
    -------
    df : pandas.DataFrame
        The same data frame object, with the empty subgroup values
        replaced.

    Raises
    ------
    KeyError
        If the data does not contain columns for all specified subgroups.
    """
    absent_subgroups = set(subgroups).difference(df.columns)
    if absent_subgroups:
        missing_names = ", ".join(absent_subgroups)
        raise KeyError(
            "The data does not contain columns for all subgroups specified "
            "in the configuration file. Please check for capitalization and "
            "other spelling errors and make sure the subgroup names do not "
            "contain hyphens. The data does not have columns for the "
            f"following subgroups: {missing_names}"
        )

    # values matching this pattern (empty or whitespace-only) count as missing
    blank_pattern = re.compile(r"^\s*$")
    cleaned_subgroups = df[subgroups].replace(to_replace=blank_pattern, value="No info")
    df[subgroups] = cleaned_subgroups
    return df
[docs]
def rename_default_columns(
    self,
    df: pd.DataFrame,
    requested_feature_names: List[str],
    id_column: str,
    first_human_score_column: Union[str, None],
    second_human_score_column: Union[str, None],
    length_column: Union[str, None],
    system_score_column: Union[str, None],
    candidate_column: Union[str, None],
) -> pd.DataFrame:
    """
    Standardize column names and rename columns with reserved column names.

    The reserved names are "spkitemid", "sc1", "sc2", "length", "raw" and
    "candidate". A column that already carries one of these names, is not
    the column configured for that role, and is not a requested feature is
    first renamed to "##NAME##" (e.g., "sc1" becomes "##sc1##"). The
    configured columns are then renamed to the reserved names.

    Parameters
    ----------
    df : pandas.DataFrame
        The data frame containing the columns to rename.
        It is copied, never modified.
    requested_feature_names : List[str]
        List of feature column names that we want to include in the
        scoring model.
    id_column : str
        Column name containing the response IDs.
    first_human_score_column : Union[str, None]
        Column name containing the H1 scores. Should be ``None`` if no H1
        scores are available.
    second_human_score_column : Union[str, None]
        Column name containing the H2 scores. Should be ``None`` if no H2
        scores are available.
    length_column : Union[str, None]
        Column name containing response lengths. Should be ``None`` if
        lengths are not available.
    system_score_column : Union[str, None]
        Column name containing the score predicted by the system. This is
        only used for rsmeval.
    candidate_column : Union[str, None]
        Column name containing identifying information at the candidate level.
        Should be ``None`` if such information is not available.

    Returns
    -------
    df : pandas.DataFrame
        A copy of the input data frame with all the appropriate
        re-namings.
    """
    renamed = df.copy()

    # reserved names, listed in the same order as the configured columns
    defaults = ["spkitemid", "sc1", "sc2", "length", "raw", "candidate"]
    configured = [
        id_column,
        first_human_score_column,
        second_human_score_column,
        length_column,
        system_score_column,
        candidate_column,
    ]

    # map each configured column to its reserved name
    name_mapping = {
        given: reserved for given, reserved in zip(configured, defaults) if given is not None
    }

    # configured columns that already carry their reserved name
    correct_defaults = [given for given, reserved in name_mapping.items() if given == reserved]

    # data columns that hold a reserved name without being the column
    # configured for it, and that are not used as features in the model
    clashing = [
        existing
        for existing in renamed.columns
        if existing in defaults
        and existing not in correct_defaults
        and existing not in requested_feature_names
    ]
    if clashing:
        renamed = renamed.rename(columns={existing: f"##{existing}##" for existing in clashing})

    # rename the custom-named columns to reserved names, one at a time and
    # in mapping order, because later renames may act on earlier results
    for given, reserved in name_mapping.items():
        if given in correct_defaults:
            continue
        # the column may already have been moved out of the way above
        current_name = f"##{given}##" if given in clashing else given
        renamed = renamed.rename(columns={current_name: reserved})

    return renamed
[docs]
def filter_on_column(
self,
df: pd.DataFrame,
column: str,
exclude_zeros: bool = False,
exclude_zero_sd: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
r"""
Filter out rows containing non-numeric values.
Filter out the rows in the given data frame that contain non-numeric
(or zero, if specified) values in the specified column. Additionally,
it may exclude any columns if they have a standard deviation
(:math:`\\sigma`) of 0.
Parameters
----------
df : pandas.DataFrame
The data frame containing the data to be filtered.
column : str
Name of the column from which to filter out values.
exclude_zeros : bool
Whether to exclude responses containing zeros in the specified column.
Defaults to ``False``.
exclude_zero_sd : bool
Whether to perform the additional filtering step of removing columns
that have :math:`\\sigma = 0`.
Defaults to ``False``.
Returns
-------
df_filtered : pandas.DataFrame
Data frame containing the responses that were *not* filtered out.
df_excluded : pandas.DataFrame
Data frame containing the non-numeric or zero responses that
were filtered out.
Note
----
The columns with :math:`\\sigma=0` are removed from both output
data frames, assuming ``exclude_zero_scores`` is ``True``.
"""
# create a copy of the original data frame
df_filter = df.copy()
# we start out assuming that we will not drop this column
drop_column = False
# return a copy of the original data frame if
# the given column does not exist at all
if column not in df.columns:
return df_filter
# Force convert the label column to numeric and
# convert whatever can't be converted to a NaN
df_filter[column] = pd.to_numeric(df_filter[column], errors="coerce").astype(float)
# Save the values that have been converted to NaNs
# as a separate data frame. We want to keep them as NaNs
# to do more analyses later. We also filter out inf values.
# Since these can only be generated during transformations,
# we include them with NaNs for consistency.
bad_rows = df_filter[column].isnull() | np.isinf(df_filter[column])
df_bad_rows = df_filter[bad_rows]
# if the column contained only non-numeric values, we need to drop it
if len(df_bad_rows) == len(df_filter):
self.logger.info(
f"Feature {column} was excluded from the model "
f"because it only contains non-numeric values."
)
drop_column = True
# now drop the above bad rows containing NaNs from our data frame
df_filter = df_filter[~bad_rows]
# exclude zeros if specified
if exclude_zeros:
zero_rows = df_filter[column] == 0
df_zero_rows = df_filter[zero_rows]
df_filter = df_filter[~zero_rows]
else:
df_zero_rows = pd.DataFrame()
# combine all the filtered rows into a single data frame
df_exclude = pd.concat([df_bad_rows, df_zero_rows], sort=True)
# reset the index so that the indexing works correctly
# for the next feature with missing values
df_filter.reset_index(drop=True, inplace=True)
df_exclude.reset_index(drop=True, inplace=True)
# Check if the the standard deviation equals zero:
# for training set sd == 0 will break normalization.
# We set the tolerance level to the 6th digit
# to account for the possibility that the exact value
# computed by `std()` is not 0
if exclude_zero_sd is True:
feature_sd = df_filter[column].std()
if np.isclose(feature_sd, 0, atol=1e-07):
self.logger.info(
f"Feature {column} was excluded from the model "
f"because its standard deviation in the "
f"training set is equal to 0."
)
drop_column = True
# if `drop_column` is true, then we need to drop the column
if drop_column:
df_filter = df_filter.drop(column, axis=1)
df_exclude = df_exclude.drop(column, axis=1)
# return the filtered rows and the new data frame
return (df_filter, df_exclude)
[docs]
def process_predictions(
self,
df_test_predictions: pd.DataFrame,
train_predictions_mean: float,
train_predictions_sd: float,
human_labels_mean: float,
human_labels_sd: float,
trim_min: float,
trim_max: float,
trim_tolerance: float = 0.4998,
) -> pd.DataFrame:
"""
Process predictions to create scaled, trimmed and rounded predictions.
Parameters
----------
df_test_predictions : pandas.DataFrame
Data frame containing the test set predictions.
train_predictions_mean : float
The mean of the predictions on the training set.
train_predictions_sd : float
The std. dev. of the predictions on the training set.
human_labels_mean : float
The mean of the human scores used to train the model.
human_labels_sd : float
The std. dev. of the human scores used to train the model.
trim_min : float
The lowest score on the score point, used for trimming the raw
regression predictions.
trim_max : float
The highest score on the score point, used for trimming the raw
regression predictions.
trim_tolerance: float
Tolerance to be added to trim_max and substracted from trim_min.
Defaults to 0.4998.
Returns
-------
df_pred_processed : pandas.DataFrame
Data frame containing the various trimmed and rounded predictions.
"""
# rescale the test set predictions by boosting
# them to match the human mean and SD
scaled_test_predictions = (
df_test_predictions["raw"] - train_predictions_mean
) / train_predictions_sd
scaled_test_predictions = scaled_test_predictions * human_labels_sd + human_labels_mean
df_pred_process = df_test_predictions.copy()
df_pred_process["scale"] = scaled_test_predictions
# trim and round the predictions before running the analyses
df_pred_process["raw_trim"] = self.trim(
df_pred_process["raw"], trim_min, trim_max, trim_tolerance
)
df_pred_process["raw_trim_round"] = np.rint(df_pred_process["raw_trim"])
df_pred_process["raw_trim_round"] = df_pred_process["raw_trim_round"].astype("int64")
df_pred_process["scale_trim"] = self.trim(
df_pred_process["scale"], trim_min, trim_max, trim_tolerance
)
df_pred_process["scale_trim_round"] = np.rint(df_pred_process["scale_trim"])
df_pred_process["scale_trim_round"] = df_pred_process["scale_trim_round"].astype("int64")
return df_pred_process
[docs]
def filter_on_flag_columns(
    self, df: pd.DataFrame, flag_column_dict: Dict[str, Any]
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Filter based on specific flag columns.

    Check that all flag_columns are present in the given
    data frame, normalize their values with ``convert_to_float`` and
    filter out the rows whose values do not match the condition in
    ``flag_column_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to filter on. It is never modified.
    flag_column_dict : Dict[str, Any]
        Dictionary containing the flag column information: each key is
        a column name and each value is an iterable of accepted values.

    Returns
    -------
    df_responses_with_requested_flags : pandas.DataFrame
        Data frame containing the responses remaining after filtering using
        the specified flag columns.
    df_responses_with_excluded_flags : pandas.DataFrame
        Data frame containing the responses filtered out using the specified
        flag columns.

    Raises
    ------
    KeyError
        If the columns listed in the dictionary are not actually present in
        the data frame.
    ValueError
        If no responses remain after filtering based on the flag column
        information.
    """
    flag_columns = list(flag_column_dict.keys())

    # nothing to filter on: everything is kept and nothing is excluded
    if not flag_columns:
        return df.copy(), pd.DataFrame(columns=df.columns)

    # check that all columns are present
    missing_flag_columns = set(flag_columns).difference(df.columns)
    if missing_flag_columns:
        raise KeyError(
            f"The data does not contain columns for all flag columns "
            f"specified in the configuration file. Please check for "
            f"capitalization and other spelling errors and make sure "
            f"the flag column names do not contain hyphens. The data "
            f"does not have the following "
            f"columns: {', '.join(missing_flag_columns)}"
        )

    # since flag column may be a mix of strings and numeric values
    # we convert all strings and integers to floats such that, for
    # example, "1", 1, and "1.0" all map to 1.0. To do this, we will
    # first convert all the strings to numbers and then convert
    # all the integers to floats.
    flag_column_dict_to_float = {
        key: [convert_to_float(value) for value in values]
        for (key, values) in flag_column_dict.items()
    }

    # and now convert the values in the flag columns of the data frame.
    # `DataFrame.map()` only exists in pandas >= 2.1.0 (where `applymap()`
    # is deprecated), so detect it on the class instead of inferring the
    # pandas version from the Python version
    df_new = df[flag_columns].copy()
    if hasattr(pd.DataFrame, "map"):
        df_new = df_new.map(convert_to_float)
    else:
        df_new = df_new.applymap(convert_to_float)

    # identify responses with values which satisfy the condition
    # in every flag column
    full_mask = df_new.isin(flag_column_dict_to_float)
    flag_mask = full_mask[list(flag_column_dict_to_float.keys())].all(axis=1)

    # return the columns from the original frame that was passed in
    # so that all data types remain the same and are not changed
    df_responses_with_requested_flags = df[flag_mask].copy()
    df_responses_with_excluded_flags = df[~flag_mask].copy()

    # make sure that the remaining data frame is not empty
    if len(df_responses_with_requested_flags) == 0:
        raise ValueError(
            "No responses remaining after filtering "
            "on flag columns. No further analysis can "
            "be run."
        )

    # reset the index
    df_responses_with_requested_flags.reset_index(drop=True, inplace=True)
    df_responses_with_excluded_flags.reset_index(drop=True, inplace=True)
    return (df_responses_with_requested_flags, df_responses_with_excluded_flags)
[docs]
def generate_feature_names(
    self,
    df: pd.DataFrame,
    reserved_column_names: List[str],
    feature_subset_specs: pd.DataFrame,
    feature_subset: Optional[str],
) -> List[str]:
    """
    Generate feature names from column names in data frame.

    This method also selects the specified subset of features.

    Parameters
    ----------
    df : pandas.DataFrame
        The data frame from which to generate feature names.
        Only its column names are read.
    reserved_column_names : List[str]
        Names of reserved columns.
    feature_subset_specs : pandas.DataFrame
        Feature subset specifications. Only used when ``feature_subset``
        is not ``None``.
    feature_subset : Optional[str]
        Feature subset column.

    Returns
    -------
    feature_names : List[str]
        List of generated features names.
    """
    # Exclude the reserved names. Only the column labels are inspected,
    # so there is no need to copy the data frame first.
    possible_feature_names = [
        cname for cname in df.columns if cname not in reserved_column_names
    ]

    # without a subset, every non-reserved column is a feature
    if feature_subset is None:
        return possible_feature_names

    # Select the features by subset.
    # In the future, we may add option to select
    # by other methods, if needed.
    return FeatureSubsetProcessor().select_by_subset(
        possible_feature_names, feature_subset_specs, feature_subset
    )
[docs]
def preprocess_feature(
    self,
    values: np.ndarray,
    feature_name: str,
    feature_transform: str,
    feature_mean: float,
    feature_sd: float,
    exclude_zero_sd: bool = False,
    raise_error: bool = True,
    truncations: Optional[pd.DataFrame] = None,
    truncate_outliers: bool = True,
) -> np.ndarray:
    """
    Clamp outliers in the given values and then transform them.

    Outlier handling (if enabled) happens first, followed by the requested
    transformation. Optionally, the result is checked for a (near-)zero
    standard deviation.

    Parameters
    ----------
    values : numpy.ndarray
        The feature values to preprocess.
    feature_name : str
        Name of the feature being pre-processed.
    feature_transform : str
        Name of the transformation function to apply.
    feature_mean : float
        Mean value to use for outlier detection instead of the mean of the
        given feature values.
    feature_sd : float
        Std. dev. value to use for outlier detection instead of the std.
        dev. of the given feature values.
    exclude_zero_sd : bool
        Check the transformed values and raise an error if their
        standard deviation is zero.
        Defaults to ``False``.
    raise_error : bool
        Passed on to the feature transformer: raise an error if any of
        the transformations lead to "inf" values or may change the ranking
        of feature values.
        Defaults to ``True``.
    truncations : Optional[pandas.DataFrame]
        Set of pre-defined truncation values. If given, these are used
        for clamping instead of ``feature_mean`` and ``feature_sd``.
        Defaults to ``None``.
    truncate_outliers : bool
        Whether to truncate outlier values.
        Defaults to ``True``.

    Returns
    -------
    transformed_feature : numpy.ndarray
        Numpy array containing the transformed and clamped
        feature values.

    Raises
    ------
    ValueError
        If the preprocessed feature values have zero standard deviation
        and ``exclude_zero_sd`` is set to ``True``.
    """
    # Step 1: decide how (and whether) to clamp the outliers.
    if not truncate_outliers:
        clamped_values = values
    elif truncations is None:
        # no pre-defined truncations, so fall back to the
        # supplied mean and standard deviation
        clamped_values = self.remove_outliers(values, mean=feature_mean, sd=feature_sd)
    else:
        clamped_values = self.remove_outliers_using_truncations(
            values, feature_name, truncations
        )

    # Step 2: apply the requested transformation.
    transformer = FeatureTransformer()
    transformed_feature = transformer.transform_feature(
        clamped_values,
        feature_name,
        feature_transform,
        raise_error=raise_error,
    )

    # Step 3 (optional): reject features that became constant.
    # ``ddof=1`` makes ``np.std`` agree with pandas' ``.std()`` and the
    # tolerance catches standard deviations that come out as a tiny
    # decimal instead of an exact 0.
    if exclude_zero_sd:
        transformed_sd = np.std(transformed_feature, ddof=1)
        if np.isclose(transformed_sd, 0, atol=1e-07):
            raise ValueError(
                f"The standard deviation for feature {feature_name} "
                f"is 0 after pre-processing. Please exclude this "
                f"feature and re-run the experiment."
            )

    return transformed_feature
[docs]
def preprocess_features(
    self,
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    df_feature_specs: pd.DataFrame,
    standardize_features: bool = True,
    use_truncations: bool = False,
    truncate_outliers: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Preprocess features in given data using corresponding specifications.

    Preprocess the feature values in the training and testing data
    frames whose specifications are contained in ``df_feature_specs``.
    Also returns a third data frame containing the feature specifications
    and other information. None of the input data frames is modified.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Data frame containing the raw feature values for the training set.
    df_test : pandas.DataFrame
        Data frame containing the raw feature values for the test set.
    df_feature_specs : pandas.DataFrame
        Data frame containing the various specifications from the feature
        file. Must have the columns "feature", "transform" and "sign", and
        additionally "min" and "max" if ``use_truncations`` is ``True``.
    standardize_features : bool
        Whether to standardize the features.
        Defaults to ``True``.
    use_truncations : bool
        Whether we should use the truncation set for removing outliers.
        Defaults to ``False``.
    truncate_outliers : bool
        Truncate outlier values if set in the configuration file.
        Defaults to ``True``.

    Returns
    -------
    df_train_preprocessed : pandas.DataFrame
        Data frame with preprocessed training data.
    df_test_preprocessed : pandas.DataFrame
        Data frame with preprocessed test data.
    df_feature_info : pandas.DataFrame
        Data frame with feature information: one row per feature with
        its transform, sign, and the training-set mean and standard
        deviation before and after preprocessing.

    Raises
    ------
    ValueError
        If a feature has zero standard deviation on the training set
        after preprocessing (raised by ``preprocess_feature``).
    """
    # keep the original data frames and work on copies
    df_train_preprocessed = df_train.copy()
    df_test_preprocessed = df_test.copy()

    # Index the specifications by feature name so that we can look up
    # per-feature settings below. This is done on a new frame so that
    # the frame passed in by the caller is left untouched.
    feature_specs = df_feature_specs.set_index("feature", drop=False)

    # if we should be using truncations, then we create the truncations
    # set from the feature specifications
    if use_truncations:
        truncations = df_feature_specs[["feature", "min", "max"]].set_index("feature")
    else:
        truncations = None

    # collect one metadata row per feature and build the frame once
    # at the end instead of concatenating inside the loop
    feature_info_rows = []

    for feature_name in feature_specs["feature"]:
        feature_transformation = feature_specs.at[feature_name, "transform"]
        feature_sign = feature_specs.at[feature_name, "sign"]

        # statistics of the *raw* training values; these are used for
        # outlier detection in both the training and the test set
        train_feature_mean = df_train[feature_name].mean()
        train_feature_sd = df_train[feature_name].std()

        # the zero-sd check is only done for the training set
        df_train_preprocessed[feature_name] = self.preprocess_feature(
            df_train[feature_name].values,
            feature_name,
            feature_transformation,
            train_feature_mean,
            train_feature_sd,
            exclude_zero_sd=True,
            truncations=truncations,
            truncate_outliers=truncate_outliers,
        )

        df_test_preprocessed[feature_name] = self.preprocess_feature(
            df_test[feature_name].values,
            feature_name,
            feature_transformation,
            train_feature_mean,
            train_feature_sd,
            truncations=truncations,
            truncate_outliers=truncate_outliers,
        )

        # Standardize the features using the mean and sd computed on the
        # training set. These are computed separately because we need to
        # get the mean of transformed feature before standardization.
        train_transformed_mean = df_train_preprocessed[feature_name].mean()
        train_transformed_sd = df_train_preprocessed[feature_name].std()

        if standardize_features:
            df_train_preprocessed[feature_name] = (
                df_train_preprocessed[feature_name] - train_transformed_mean
            ) / train_transformed_sd
            df_test_preprocessed[feature_name] = (
                df_test_preprocessed[feature_name] - train_transformed_mean
            ) / train_transformed_sd

        # Multiply both train and test feature by sign.
        df_train_preprocessed[feature_name] = df_train_preprocessed[feature_name] * feature_sign
        df_test_preprocessed[feature_name] = df_test_preprocessed[feature_name] * feature_sign

        # record the preprocessing metadata for this feature
        feature_info_rows.append(
            {
                "feature": feature_name,
                "transform": feature_transformation,
                "sign": feature_sign,
                "train_mean": train_feature_mean,
                "train_sd": train_feature_sd,
                "train_transformed_mean": train_transformed_mean,
                "train_transformed_sd": train_transformed_sd,
            }
        )

    # one row per feature, with a fresh 0..n-1 index
    df_feature_info = pd.DataFrame(feature_info_rows)

    # return the three data frames
    return (df_train_preprocessed, df_test_preprocessed, df_feature_info)
[docs]
def filter_data(
    self,
    df: pd.DataFrame,
    label_column: str,
    id_column: str,
    length_column: Optional[str],
    second_human_score_column: Optional[str],
    candidate_column: str,
    requested_feature_names: List[str],
    reserved_column_names: List[str],
    given_trim_min: Optional[float],
    given_trim_max: Optional[float],
    flag_column_dict: Dict[str, Any],
    subgroups: List[str],
    exclude_zero_scores: bool = True,
    exclude_zero_sd: bool = False,
    feature_subset_specs: Optional[pd.DataFrame] = None,
    feature_subset: Optional[str] = None,
    min_candidate_items: Optional[int] = None,
    use_fake_labels: bool = False,
) -> Tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    float,
    float,
    List[str],
]:
    """
    Filter rows with zero/non-numeric values for ``label_column``.

    The ID, label and other configured columns are first renamed to their
    default names ("spkitemid", "sc1", ...). The function then checks
    whether any features that are specifically requested in
    ``requested_feature_names`` are missing from the data. If no
    feature names are requested, the feature list is generated based
    on column names and subset information, if available. The function
    then excludes non-numeric values for any feature. It will also exclude
    zero scores if ``exclude_zero_scores`` is ``True``. If the user
    requested to exclude candidates with less than ``min_candidate_items``,
    such candidates are also excluded.

    If ``use_fake_labels`` is ``True``, no filtering on the label is done;
    instead random integer labels are generated between the trim values
    (1 and 10 if no trim values are given).

    Finally, it splits the data into: (a) data frame with
    feature values and scores (b) data frame with information about
    subgroup and candidate (metadata) and (c) the data frame with all other
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The data frame to filter.
    label_column : str
        The label column in the data.
    id_column : str
        The ID column in the data.
    length_column : Optional[str]
        The length column in the data.
    second_human_score_column : Optional[str]
        The second human score column in the data.
    candidate_column : str
        The candidate column in the data.
    requested_feature_names : List[str]
        A list of requested feature names.
    reserved_column_names : List[str]
        A list of reserved column names.
    given_trim_min : Optional[float]
        The minimum trim value. If ``None``, it is derived from the data.
    given_trim_max : Optional[float]
        The maximum trim value. If ``None``, it is derived from the data.
    flag_column_dict : Dict[str, Any]
        A dictionary of flag columns.
    subgroups : List[str]
        List containing subgroup names.
    exclude_zero_scores : bool
        Whether to exclude zero scores.
        Defaults to ``True``.
    exclude_zero_sd : bool
        Whether to exclude zero standard deviation.
        Defaults to ``False``.
    feature_subset_specs : Optional[pandas.DataFrame]
        The data frame containing the feature subset specifications.
        Defaults to ``None``.
    feature_subset : Optional[str]
        The feature subset group (e.g. 'A').
        Defaults to ``None``.
    min_candidate_items : Optional[int]
        The minimum number of items needed to include candidate.
        Defaults to ``None``.
    use_fake_labels : bool
        Whether to use fake labels.
        Defaults to ``False``.

    Returns
    -------
    df_filtered_features : pandas.DataFrame
        Data frame with filtered features.
    df_filtered_metadata : pandas.DataFrame
        Data frame with filtered metadata.
    df_filtered_other_columns : pandas.DataFrame
        Data frame with other columns filtered.
    df_excluded : pandas.DataFrame
        Data frame with excluded records.
    df_filtered_length : pandas.DataFrame
        Data frame with length column(s) filtered.
    df_filtered_human_scores : pandas.DataFrame
        Data frame with human scores filtered.
    df_responses_with_excluded_flags : pandas.DataFrame
        Data frame containing responses with excluded flags.
    trim_min : float
        The minimum trim value.
    trim_max : float
        The maximum trim value.
    feature_names : List[str]
        A list of feature names.

    Raises
    ------
    KeyError
        If any of the configured columns or any of the features
        do not exist in the data.
    ValueError
        If the response IDs are not unique, if a feature uses a reserved
        column name, if no responses (or candidates) are left after
        filtering, or if a requested feature was dropped because of
        zero standard deviation.
    """
    # make sure that the columns specified in the
    # config file actually exist
    columns_to_check = [id_column, label_column]
    for optional_column in (length_column, second_human_score_column, candidate_column):
        if optional_column:
            columns_to_check.append(optional_column)

    missing_columns = set(columns_to_check).difference(df.columns)
    if missing_columns:
        raise KeyError(
            f"Columns {missing_columns} from the config file do not exist in the data."
        )

    # it is possible for the `id_column` and `candidate_column` to be
    # set to the same column name in the CSV file, e.g., if there is
    # only one response per candidate. If this happens, we need to
    # create a duplicate column for candidates or id for the downstream
    # processing to work as usual.
    if id_column == candidate_column:
        # if the name for both columns is `candidate`, we need to
        # create a separate id_column name
        if id_column == "candidate":
            df["spkitemid"] = df["candidate"].copy()
            id_column = "spkitemid"
        # else we create a separate `candidate` column
        else:
            df["candidate"] = df[id_column].copy()
            candidate_column = "candidate"

    df = self.rename_default_columns(
        df,
        requested_feature_names,
        id_column,
        label_column,
        second_human_score_column,
        length_column,
        None,
        candidate_column,
    )

    # check that the id_column contains unique values
    if not df["spkitemid"].is_unique:
        raise ValueError(
            f"The data contains duplicate response IDs in '{id_column}'. "
            f"Please make sure all response IDs are unique and re-run the tool."
        )

    # Generate feature names if no specific features were requested by the user
    if len(requested_feature_names) == 0:
        feature_names = self.generate_feature_names(
            df,
            reserved_column_names,
            feature_subset_specs=feature_subset_specs,
            feature_subset=feature_subset,
        )
    else:
        feature_names = requested_feature_names

    # make sure that feature names do not contain reserved column names
    illegal_feature_names = set(feature_names).intersection(reserved_column_names)
    if illegal_feature_names:
        raise ValueError(
            f"The following reserved column names cannot be used as "
            f"feature names: '{', '.join(illegal_feature_names)}'. "
            f"Please rename these columns and re-run the experiment."
        )

    # check to make sure that the subgroup columns are all present
    df = self.check_subgroups(df, subgroups)

    # filter out the responses based on flag columns
    (
        df_responses_with_requested_flags,
        df_responses_with_excluded_flags,
    ) = self.filter_on_flag_columns(df, flag_column_dict)

    # filter out the rows that have non-numeric or zero labels
    # unless we are going to generate fake labels in the first place
    if not use_fake_labels:
        (df_filtered, df_excluded) = self.filter_on_column(
            df_responses_with_requested_flags,
            "sc1",
            exclude_zeros=exclude_zero_scores,
        )

        # make sure that the remaining data frame is not empty
        if len(df_filtered) == 0:
            raise ValueError(
                "No responses remaining after filtering out "
                "non-numeric human scores. No further analysis "
                "can be run. "
            )

        # Compare against ``None`` rather than relying on truthiness:
        # an explicitly requested trim value of 0 must be honored.
        trim_min = given_trim_min if given_trim_min is not None else df_filtered["sc1"].min()
        trim_max = given_trim_max if given_trim_max is not None else df_filtered["sc1"].max()
    else:
        df_filtered = df_responses_with_requested_flags.copy()

        # Nothing is excluded based on the labels in this case, but the
        # rest of the function still needs an "excluded" frame to add
        # to, so we start with an empty one that has the same columns.
        df_excluded = df_filtered.iloc[0:0].copy()

        trim_min = given_trim_min if given_trim_min is not None else 1
        trim_max = given_trim_max if given_trim_max is not None else 10

        self.logger.info(f"Generating labels randomly from [{trim_min}, {trim_max}]")
        randgen = RandomState(seed=1234567890)

        # The label column has already been renamed to "sc1" above and
        # that is the column all of the code below reads, so the fake
        # labels go there. ``randint`` excludes the upper bound, hence
        # the +1 to keep ``trim_max`` itself a possible label.
        df_filtered["sc1"] = randgen.randint(
            int(trim_min), int(trim_max) + 1, size=len(df_filtered)
        )

    # make sure there are no missing features in the data
    missing_features = set(feature_names).difference(df_filtered.columns)
    if missing_features:
        raise KeyError(
            f"DataFrame does not contain columns for all features "
            f"specified in the feature file. Please check for "
            f"capitalization and other spelling errors and make "
            f"sure the feature names do not contain hyphens. "
            f"The data does not have columns for the following "
            f"features: {', '.join(missing_features)}"
        )

    # make sure all features selected for model building are numeric
    # and also replace any non-numeric feature values in already
    # excluded data with NaNs for consistency
    for feat in feature_names:
        df_excluded[feat] = pd.to_numeric(df_excluded[feat], errors="coerce").astype(float)
        df_filtered, newdf_excluded = self.filter_on_column(
            df_filtered,
            feat,
            exclude_zeros=False,
            exclude_zero_sd=exclude_zero_sd,
        )
        with np.errstate(divide="ignore"):
            df_excluded = pd.concat([df_excluded, newdf_excluded], sort=True)

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError(
            "No responses remaining after filtering "
            "out non-numeric feature values. No further "
            "analysis can be run."
        )

    # Raise an error if we excluded features that were
    # specifically requested because sd == 0.
    omitted_features = set(requested_feature_names).difference(df_filtered.columns)
    if omitted_features:
        raise ValueError(
            f"The following requested features were excluded "
            f"because their standard deviation on the training "
            f"set was 0: {', '.join(omitted_features)}.\nPlease "
            f"edit the feature file to exclude these features "
            f"and re-run the tool"
        )

    # Update the feature names: keep only those still in the data
    feature_names = [feature for feature in feature_names if feature in df_filtered]

    # if ``length_column`` exists, make sure it's converted to numeric;
    # values that cannot be coerced to numeric will be set to ``np.nan``
    if length_column:
        df_filtered["length"] = pd.to_numeric(df_filtered["length"], errors="coerce")

        # check the values for length column. We do this after filtering
        # to make sure we have removed responses that have not been
        # processed correctly. If the column is unusable, rename it to
        # ##ORIGINAL_NAME## so that it ends up with the "other" columns.
        if df_filtered["length"].isnull().any() or df_filtered["length"].std() <= 0:
            self.logger.warning(
                f"The {length_column} column either has missing values "
                f"or a standard deviation <= 0. No length-based analysis "
                f"will be provided. The column will be renamed as "
                f"##{length_column}## and saved in *train_other_columns.csv."
            )
            df_filtered.rename(columns={"length": f"##{length_column}##"}, inplace=True)

    # if requested, exclude the candidates with less than X responses
    # left after filtering
    if min_candidate_items:
        (df_filtered_candidates, df_excluded_candidates) = self.select_candidates(
            df_filtered, min_candidate_items
        )

        # check that there are still responses left for analysis
        if len(df_filtered_candidates) == 0:
            raise ValueError(
                f"After filtering non-numeric scores and non-numeric "
                f"feature values there were no candidates with "
                f"{min_candidate_items} or more responses left "
                f"for analysis."
            )

        # redefine df_filtered
        df_filtered = df_filtered_candidates.copy()

        # update df_excluded
        df_excluded = pd.concat([df_excluded, df_excluded_candidates], sort=True)

    # create separate data frames for features and sc1, all other
    # information, and responses excluded during filtering;
    # `not_other_columns` tracks everything that has found a home
    not_other_columns = set()

    feature_columns = ["spkitemid", "sc1"] + feature_names
    df_filtered_features = df_filtered[feature_columns]
    not_other_columns.update(feature_columns)

    metadata_columns = ["spkitemid"] + subgroups
    if candidate_column:
        metadata_columns.append("candidate")
    df_filtered_metadata = df_filtered[metadata_columns]
    not_other_columns.update(metadata_columns)

    # the "length" column is gone if it was renamed above
    df_filtered_length = pd.DataFrame()
    length_columns = ["spkitemid", "length"]
    if length_column and "length" in df_filtered:
        df_filtered_length = df_filtered[length_columns]
        not_other_columns.update(length_columns)

    df_filtered_human_scores = pd.DataFrame()
    human_score_columns = ["spkitemid", "sc1", "sc2"]
    if second_human_score_column and "sc2" in df_filtered:
        df_filtered_human_scores = df_filtered[human_score_columns].copy()
        not_other_columns.update(["sc2"])

        # filter out any non-numeric value rows
        # as well as zeros, if we were asked to
        df_filtered_human_scores["sc2"] = pd.to_numeric(
            df_filtered_human_scores["sc2"], errors="coerce"
        ).astype(float)
        if exclude_zero_scores:
            df_filtered_human_scores["sc2"] = df_filtered_human_scores["sc2"].replace(0, np.nan)

    # we need to make sure that `spkitemid` is the first column
    df_excluded = df_excluded[
        ["spkitemid"] + [column for column in df_excluded if column != "spkitemid"]
    ]

    # now extract all other columns and add 'spkitemid'
    other_columns = ["spkitemid"] + [
        column for column in df_filtered if column not in not_other_columns
    ]
    df_filtered_other_columns = df_filtered[other_columns]

    return (
        df_filtered_features,
        df_filtered_metadata,
        df_filtered_other_columns,
        df_excluded,
        df_filtered_length,
        df_filtered_human_scores,
        df_responses_with_excluded_flags,
        trim_min,
        trim_max,
        feature_names,
    )
[docs]
def process_data_rsmeval(
    self, config_obj: Configuration, data_container_obj: DataContainer
) -> Tuple[Configuration, DataContainer]:
    """
    Set up rsmeval experiment by loading & preprocessing evaluation data.

    This function takes a configuration object and a container object
    as input and returns the same types of objects as output after
    the loading, normalizing, and preprocessing.

    Parameters
    ----------
    config_obj : Configuration
        The configuration object.
    data_container_obj : DataContainer
        The data container object. Must contain a "predictions" frame and
        may contain a "scale" frame with "prediction" and "sc1" columns.

    Returns
    -------
    config_obj : Configuration
        New configuration object containing the updated rsmeval configuration.
    data_container : DataContainer
        New data container object containing the preprocessed data.

    Raises
    ------
    KeyError
        If columns specified in the configuration do not exist in the
        predictions file.
    KeyError
        If the data used for scaling does not have both the "prediction"
        and the "sc1" columns.
    ValueError
        If the columns containing the human scores and the system
        scores in the predictions file have the same name.
    ValueError
        If the columns containing the first set of human scores
        and the second set of human scores in the predictions file
        have the same name.
    ValueError
        If the predictions file contains the same response ID more than
        once.
    ValueError
        No responses were left after filtering out zero or non-numeric
        values for the various columns.
    """
    # get the directory where the config file lives
    configpath = config_obj.configdir

    pred_file_location = DataReader.locate_files(config_obj["predictions_file"], configpath)[0]

    # get the column names for the human and system scores
    human_score_column = config_obj["human_score_column"]
    system_score_column = config_obj["system_score_column"]

    # if the human score column is the same as the
    # system score column, raise an error
    if human_score_column == system_score_column:
        raise ValueError(
            "'human_score_column' and " "'system_score_column' " "cannot have the same value."
        )

    # get the name of the optional column that
    # contains the second human score
    second_human_score_column = config_obj["second_human_score_column"]

    # if the human score column is the same as the
    # second human score column, raise an error
    if human_score_column == second_human_score_column:
        raise ValueError(
            "'human_score_column' and "
            "'second_human_score_column' "
            "cannot have the same value."
        )

    # get the column name that will hold the response ID
    id_column = config_obj["id_column"]

    # get the specified trim min and max, if any
    # and make sure they are valid numbers and not None
    (
        spec_trim_min,
        spec_trim_max,
        spec_trim_tolerance,
    ) = config_obj.get_trim_min_max_tolerance()

    # this should never happen since `trim_min` and `trim_max` are required
    # fields for rsmeval and `trim_tolerance` always has a default value
    # but we need to check it here to satisfy mypy
    assert (
        spec_trim_min is not None
        and spec_trim_max is not None
        and spec_trim_tolerance is not None
    )

    # get the subgroups if any
    subgroups = config_obj.get("subgroups")

    # get the candidate column if any
    candidate_column = config_obj["candidate_column"]

    # check if we are excluding candidates based on number of responses
    exclude_listwise = config_obj.check_exclude_listwise()
    min_items_per_candidate = config_obj["min_items_per_candidate"]

    general_report_sections = config_obj["general_sections"]

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = config_obj["custom_sections"]
    if custom_report_section_paths:
        self.logger.info("Locating custom report sections")
        custom_report_sections = Reporter.locate_custom_sections(
            custom_report_section_paths, configpath
        )
    else:
        custom_report_sections = []

    section_order = config_obj["section_order"]

    # check all sections values and order and get the
    # ordered list of notebook files
    chosen_notebook_files = Reporter().get_ordered_notebook_files(
        general_report_sections,
        custom_report_sections,
        section_order,
        subgroups,
        model_type=None,
        context="rsmeval",
    )

    # are we excluding zero scores?
    exclude_zero_scores = config_obj["exclude_zero_scores"]

    # if we are excluding zero scores but trim_min
    # is set to 0, then we need to warn the user
    if exclude_zero_scores and spec_trim_min == 0:
        self.logger.warning(
            "'exclude_zero_scores' is set to True but "
            " 'trim_min' is set to 0. This may cause "
            " unexpected behavior."
        )

    # are we filtering on any other columns?
    flag_column_dict = config_obj.check_flag_column(partition="test")

    # how (if at all) should the predictions be scaled?
    scale_with = config_obj.get("scale_with")

    # use scaled predictions for the analyses unless
    # we were told not to
    use_scaled_predictions = scale_with is not None

    # log an appropriate message
    if scale_with is None:
        message = "Assuming given system predictions " "are unscaled and will be used as such."
    elif scale_with == "asis":
        message = (
            "Assuming given system predictions " "are already scaled and will be used as such."
        )
    else:
        message = (
            "Assuming given system predictions " "are unscaled and will be scaled before use."
        )
    self.logger.info(message)

    df_pred = data_container_obj["predictions"]

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column, human_score_column, system_score_column]
    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
    if candidate_column:
        columns_to_check.append(candidate_column)

    missing_columns = set(columns_to_check).difference(df_pred.columns)
    if missing_columns:
        raise KeyError(
            f"Columns {missing_columns} from the config file do not "
            f"exist in the predictions file."
        )

    df_pred = self.rename_default_columns(
        df_pred,
        [],
        id_column,
        human_score_column,
        second_human_score_column,
        None,
        system_score_column,
        candidate_column,
    )

    # check that the id_column contains unique values
    if df_pred["spkitemid"].size != df_pred["spkitemid"].unique().size:
        raise ValueError(
            f"The data contains duplicate response IDs in '{id_column}'. "
            f"Please make sure all response IDs are unique and re-run the tool."
        )

    df_pred = self.check_subgroups(df_pred, subgroups)

    # filter out the responses based on flag columns
    (
        df_responses_with_requested_flags,
        df_responses_with_excluded_flags,
    ) = self.filter_on_flag_columns(df_pred, flag_column_dict)

    # filter out rows that have non-numeric or zero human scores
    df_filtered, df_excluded = self.filter_on_column(
        df_responses_with_requested_flags,
        "sc1",
        exclude_zeros=exclude_zero_scores,
    )

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError(
            "No responses remaining after filtering out "
            "non-numeric human scores. No further analysis "
            "can be run. "
        )

    # Change all non-numeric machine scores in excluded
    # data to NaNs for consistency with rsmtool.
    # NOTE: This will *not* work if *all* of the values
    # in column are non-numeric. This is a known bug in
    # pandas: https://github.com/pydata/pandas/issues/9589
    # Therefore, we need add an additional check after this.
    df_excluded["raw"] = pd.to_numeric(df_excluded["raw"], errors="coerce").astype(float)

    # filter out the non-numeric machine scores from the rest of the data
    df_filtered_pred, newdf_excluded = self.filter_on_column(
        df_filtered, "raw", exclude_zeros=False
    )
    del df_filtered

    # make sure that the remaining data frame is not empty
    if len(df_filtered_pred) == 0:
        raise ValueError(
            "No responses remaining after filtering out "
            "non-numeric machine scores. No further analysis "
            "can be run. "
        )

    # concatenate the excluded data frames but ignore the
    # FutureWarning about the behavior of Dataframe concatenation
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="The behavior of Dataframe concatenation",
            category=FutureWarning,
        )
        df_excluded = pd.concat([df_excluded, newdf_excluded], sort=True)

    # if requested, exclude the candidates with less than X responses
    # left after filtering
    if exclude_listwise:
        (df_filtered_candidates, df_excluded_candidates) = self.select_candidates(
            df_filtered_pred, min_items_per_candidate
        )

        # check that there are still responses left for analysis
        if len(df_filtered_candidates) == 0:
            raise ValueError(
                f"After filtering non-numeric human and system scores "
                f"there were no candidates with {str(min_items_per_candidate)} "
                f"or more responses left for analysis."
            )

        # redefine df_filtered_pred
        df_filtered_pred = df_filtered_candidates.copy()

        # update df_excluded but ignore the FutureWarning about the
        # behavior of Dataframe concatenation
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="The behavior of Dataframe concatenation",
                category=FutureWarning,
            )
            df_excluded = pd.concat([df_excluded, df_excluded_candidates], sort=True)

    # make sure that `spkitemid` is the first column; `df_excluded`
    # is not modified after this point so we only need to do this once
    df_excluded = df_excluded[
        ["spkitemid"] + [column for column in df_excluded if column != "spkitemid"]
    ]

    # set default values for scaling, i.e., no scaling at all
    scale_pred_mean = 0
    scale_pred_sd = 1
    scale_human_mean = 0
    scale_human_sd = 1

    if data_container_obj.get_frame("scale") is not None:
        df_scale = data_container_obj["scale"]

        # *both* columns are needed to compute the scaling parameters,
        # so complain if either one of them is missing
        if "sc1" not in df_scale.columns or "prediction" not in df_scale.columns:
            raise KeyError(
                "The CSV file specified for scaling "
                'must have the "prediction" and the "sc1" columns.'
            )

        scale_pred_mean = df_scale["prediction"].mean()
        scale_pred_sd = df_scale["prediction"].std()
        scale_human_mean = df_scale["sc1"].mean()
        scale_human_sd = df_scale["sc1"].std()

    self.logger.info("Processing predictions")
    df_pred_processed = self.process_predictions(
        df_filtered_pred,
        scale_pred_mean,
        scale_pred_sd,
        scale_human_mean,
        scale_human_sd,
        spec_trim_min,
        spec_trim_max,
        spec_trim_tolerance,
    )

    # which score columns we keep depends on how we were asked to scale
    if not scale_with:
        expected_score_types = ["raw", "raw_trim", "raw_trim_round"]
    elif scale_with == "asis":
        expected_score_types = ["scale", "scale_trim", "scale_trim_round"]
    else:
        expected_score_types = [
            "raw",
            "raw_trim",
            "raw_trim_round",
            "scale",
            "scale_trim",
            "scale_trim_round",
        ]

    # extract separated data frames that we will write out
    # as separate files; `not_other_columns` tracks every
    # column that has been assigned to one of them
    not_other_columns = set()

    prediction_columns = ["spkitemid", "sc1"] + expected_score_types
    df_predictions_only = df_pred_processed[prediction_columns]
    not_other_columns.update(prediction_columns)

    metadata_columns = ["spkitemid"] + subgroups
    if candidate_column:
        metadata_columns.append("candidate")
    df_test_metadata = df_filtered_pred[metadata_columns]
    not_other_columns.update(metadata_columns)

    df_test_human_scores = pd.DataFrame()
    human_score_columns = ["spkitemid", "sc1", "sc2"]
    if second_human_score_column and "sc2" in df_filtered_pred:
        df_test_human_scores = df_filtered_pred[human_score_columns].copy()
        not_other_columns.update(["sc2"])

        # filter out any non-numeric values now
        # as well as zeros, if we were asked to
        df_test_human_scores["sc2"] = pd.to_numeric(
            df_test_human_scores["sc2"], errors="coerce"
        ).astype(float)
        if exclude_zero_scores:
            df_test_human_scores["sc2"] = df_test_human_scores["sc2"].replace(0, np.nan)

    # remove 'spkitemid' from `not_other_columns`
    # because we want that in the other columns
    # data frame
    not_other_columns.remove("spkitemid")

    # extract all of the other columns in the predictions file
    other_columns = [
        column for column in df_filtered_pred.columns if column not in not_other_columns
    ]
    df_pred_other_columns = df_filtered_pred[other_columns]

    # add internal configuration options that we need
    new_config_obj = config_obj.copy()
    internal_options_dict = {
        "pred_file_location": pred_file_location,
        "exclude_listwise": exclude_listwise,
        "use_scaled_predictions": use_scaled_predictions,
        "chosen_notebook_files": chosen_notebook_files,
    }
    for key, value in internal_options_dict.items():
        new_config_obj[key] = value

    # pair each output frame with the name it is stored under
    frames = [
        df_predictions_only,
        df_test_metadata,
        df_pred_other_columns,
        df_test_human_scores,
        df_excluded,
        df_responses_with_excluded_flags,
    ]
    names = [
        "pred_test",
        "test_metadata",
        "test_other_columns",
        "test_human_scores",
        "test_excluded",
        "test_responses_with_excluded_flags",
    ]
    new_container_datasets = [
        DatasetDict({"name": name, "frame": frame}) for frame, name in zip(frames, names)
    ]
    new_container = DataContainer(datasets=new_container_datasets)

    return new_config_obj, new_container
[docs]
def process_data_rsmpredict(
    self, config_obj: Configuration, data_container_obj: DataContainer
) -> Tuple[Configuration, DataContainer]:
    """
    Process data for rsmpredict experiments.

    Check that the columns named in the configuration exist in the input
    features, rename them to their default names, pre-process the feature
    values, generate predictions with the configured model, post-process
    those predictions, and attach the requested metadata columns.

    Parameters
    ----------
    config_obj : Configuration
        The configuration object.
    data_container_obj : DataContainer
        The data container object. The "input_features", "feature_info",
        and "postprocessing_params" frames are read from it.

    Returns
    -------
    config_obj : Configuration
        The configuration object that was passed in.
    data_container : DataContainer
        New data container object holding the "features_processed",
        "excluded", "predictions_with_metadata", and "predictions" frames.

    Raises
    ------
    KeyError
        If columns specified in the configuration do not exist in the data.
    ValueError
        If data contains duplicate response IDs.
    """
    # pull the frames we need out of the container
    df_input = data_container_obj["input_features"]
    df_feature_info = data_container_obj["feature_info"]
    df_postproc_params = data_container_obj["postprocessing_params"]

    # read the relevant settings: ID column, human score columns (if any),
    # subgroups (if any), the model, the pre-processing switches, whether
    # to predict expected scores, the flag columns (if any), and the
    # candidate column (if any)
    id_column = config_obj["id_column"]
    human_score_column = config_obj["human_score_column"]
    second_human_score_column = config_obj["second_human_score_column"]
    subgroups = config_obj["subgroups"]
    model = config_obj["model"]
    standardize_features = config_obj.get("standardize_features", True)
    truncate_outliers = config_obj.get("truncate_outliers", True)
    predict_expected_scores = config_obj["predict_expected_scores"]
    flag_column_dict = config_obj.check_flag_column(partition="test")
    candidate_column = config_obj["candidate_column"]

    # subgroups and flag columns are copied into the final file; those
    # plus the ID column must all exist in the input data
    flag_column_names = list(flag_column_dict.keys())
    metadata_columns = subgroups + flag_column_names
    required_columns = [id_column] + metadata_columns

    # Each entry is (include?, name to look for in the input, default name
    # to copy after renaming). The human score column defaults to "sc1" and
    # we cannot tell a default apart from an explicit setting, so it is
    # only required when set to something else; "sc1" is still copied
    # whenever it is present in the input.
    optional_columns = [
        (human_score_column != "sc1" or "sc1" in df_input.columns, human_score_column, "sc1"),
        (candidate_column, candidate_column, "candidate"),
        (second_human_score_column, second_human_score_column, "sc2"),
    ]
    for is_requested, configured_name, default_name in optional_columns:
        if is_requested:
            required_columns.append(configured_name)
            metadata_columns.append(default_name)

    missing_columns = set(required_columns).difference(df_input.columns)
    if missing_columns:
        raise KeyError(
            f"Columns {missing_columns} from the config file do not exist in the data."
        )

    # rename all columns to their default names
    df_input = self.rename_default_columns(
        df_input,
        [],
        id_column,
        human_score_column,
        second_human_score_column,
        None,
        None,
        candidate_column=candidate_column,
    )

    # every response ID must occur exactly once
    response_ids = df_input["spkitemid"]
    if len(response_ids.unique()) < len(response_ids):
        raise ValueError(
            f"The data contains repeated response IDs in {id_column}. "
            f"Please make sure all response IDs are unique and re-run the tool."
        )

    df_features_preprocessed, df_excluded = self.preprocess_new_data(
        df_input,
        df_feature_info,
        standardize_features=standardize_features,
        truncate_outliers=truncate_outliers,
    )

    def first_param(name):
        # each post-processing parameter is read from the first row of its column
        return df_postproc_params[name].values[0]

    trim_min = first_param("trim_min")
    trim_max = first_param("trim_max")
    h1_mean = first_param("h1_mean")
    h1_sd = first_param("h1_sd")

    # the post-processing parameters may not include a trim tolerance;
    # in that case fall back to the old default value and warn the user
    if "trim_tolerance" not in df_postproc_params:
        trim_tolerance = 0.4998
        self.logger.warning(
            "The tolerance for trimming scores will be assumed to be 0.4998, "
            "the default value in previous versions of RSMTool. "
            "We recommend re-training the model to ensure future "
            "compatibility."
        )
    else:
        trim_tolerance = first_param("trim_tolerance")

    # now generate the predictions for the features using this model
    self.logger.info(
        "Generating predictions" + (" (expected scores)." if predict_expected_scores else ".")
    )

    # compute minimum and maximum score for expected predictions
    min_score = int(np.rint(trim_min - trim_tolerance))
    max_score = int(np.rint(trim_max + trim_tolerance))

    df_predictions = model.predict(
        df_features_preprocessed,
        min_score,
        max_score,
        predict_expected=predict_expected_scores,
    )

    train_predictions_mean = first_param("train_predictions_mean")
    train_predictions_sd = first_param("train_predictions_sd")

    df_predictions = self.process_predictions(
        df_predictions,
        train_predictions_mean,
        train_predictions_sd,
        h1_mean,
        h1_sd,
        trim_min,
        trim_max,
        trim_tolerance,
    )

    # add back the columns that we were requested to copy, if any
    if len(columns_requested := metadata_columns) > 0:
        df_predictions_with_metadata = pd.merge(
            df_predictions, df_input[["spkitemid"] + columns_requested]
        )
        # the merge must neither drop nor duplicate any prediction rows
        assert len(df_predictions) == len(df_predictions_with_metadata)
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # we need to make sure that `spkitemid` is the first column
    excluded_column_order = ["spkitemid"] + [
        name for name in df_excluded if name != "spkitemid"
    ]
    df_excluded = df_excluded[excluded_column_order]

    named_frames = [
        ("features_processed", df_features_preprocessed),
        ("excluded", df_excluded),
        ("predictions_with_metadata", df_predictions_with_metadata),
        ("predictions", df_predictions),
    ]
    datasets = [DatasetDict({"name": name, "frame": frame}) for name, frame in named_frames]

    return config_obj, DataContainer(datasets=datasets)
[docs]
def process_data_rsmexplain(
    self, config_obj: Configuration, data_container_obj: DataContainer
) -> Tuple[Configuration, DataContainer]:
    """
    Process data for rsmexplain experiments.

    Rename the ID column in the background and explain feature frames,
    check that the IDs in each frame are unique, pre-process the features
    in both frames, and index both frames by response ID.

    Parameters
    ----------
    config_obj : Configuration
        The configuration object.
    data_container_obj : DataContainer
        The data container object. The "background_features",
        "explain_features", and "feature_info" frames are read from it.

    Returns
    -------
    config_obj : Configuration
        The configuration object that was passed in.
    data_container : DataContainer
        New data container object holding the
        "background_features_preprocessed" and
        "explain_features_preprocessed" frames.

    Raises
    ------
    ValueError
        If data contains duplicate response IDs.
    """
    df_background_features = data_container_obj["background_features"]
    df_explain_features = data_container_obj["explain_features"]
    df_feature_info = data_container_obj["feature_info"]

    # get the column name that holds the IDs
    id_column = config_obj["id_column"]

    # should features be standardized, and should outliers be truncated?
    preprocessing_options = {
        "standardize_features": config_obj.get("standardize_features", True),
        "truncate_outliers": config_obj.get("truncate_outliers", True),
    }

    # rename the ID columns in both frames (background first, then explain)
    df_background_renamed, df_explain_renamed = [
        self.rename_default_columns(df_raw, [], id_column, None, None, None, None, None)
        for df_raw in (df_background_features, df_explain_features)
    ]

    # check that the default ID column in both frames contains unique values
    uniqueness_checks = [
        (
            df_background_renamed,
            f"The background data contains repeated response IDs in {id_column}. "
            f"Please make sure all response IDs are unique and re-run the tool.",
        ),
        (
            df_explain_renamed,
            f"The explain data contains repeated response IDs in {id_column}. "
            f"Please make sure all response IDs are unique and re-run the tool.",
        ),
    ]
    for df_renamed, duplicate_message in uniqueness_checks:
        response_ids = df_renamed["spkitemid"]
        if response_ids.size != response_ids.unique().size:
            raise ValueError(duplicate_message)

    # now pre-process all the features that go into the model; the frames
    # of excluded responses are not needed here
    df_background_preprocessed, _ = self.preprocess_new_data(
        df_background_renamed, df_feature_info, **preprocessing_options
    )
    df_explain_preprocessed, _ = self.preprocess_new_data(
        df_explain_renamed, df_feature_info, **preprocessing_options
    )

    # set ID column as index for the background and explain feature frames
    # since we are going to convert these to featuresets in rsmexplain
    for df_preprocessed in (df_background_preprocessed, df_explain_preprocessed):
        df_preprocessed.set_index("spkitemid", inplace=True)

    # return the container with the pre-processed features
    named_frames = [
        ("background_features_preprocessed", df_background_preprocessed),
        ("explain_features_preprocessed", df_explain_preprocessed),
    ]
    datasets = [DatasetDict({"name": name, "frame": frame}) for name, frame in named_frames]

    return config_obj, DataContainer(datasets=datasets)
[docs]
def process_data(
    self, config_obj: Configuration, data_container_obj: DataContainer, context: str = "rsmtool"
) -> Tuple[Configuration, DataContainer]:
    """
    Process and setup the data for an experiment in the given context.

    This is a thin dispatcher: it forwards both arguments to the
    ``process_data_<context>`` method that matches ``context``.

    Parameters
    ----------
    config_obj : Configuration
        The configuration object.
    data_container_obj : DataContainer
        The data container object.
    context : str
        The tool context: one of {"rsmtool", "rsmeval", "rsmpredict",
        "rsmexplain"}.
        Defaults to "rsmtool".

    Returns
    -------
    config_obj : Configuration
        The configuration object returned by the context-specific method.
    data_container : DataContainer
        The data container object returned by the context-specific method.

    Raises
    ------
    ValueError
        If the context is not one of {"rsmtool", "rsmeval", "rsmpredict",
        "rsmexplain"}.
    """
    # context name -> name of the method on this object that handles it;
    # methods are looked up lazily so only the selected one is touched
    context_handlers = (
        ("rsmtool", "process_data_rsmtool"),
        ("rsmeval", "process_data_rsmeval"),
        ("rsmpredict", "process_data_rsmpredict"),
        ("rsmexplain", "process_data_rsmexplain"),
    )
    for known_context, handler_name in context_handlers:
        if context == known_context:
            handler = getattr(self, handler_name)
            return handler(config_obj, data_container_obj)

    raise ValueError(
        f"The 'context' argument must be in the set: ['rsmtool', "
        f"'rsmeval', 'rsmpredict', 'rsmexplain']. You specified `{context}`."
    )
[docs]
def preprocess_new_data(
    self,
    df_input: pd.DataFrame,
    df_feature_info: pd.DataFrame,
    standardize_features: bool = True,
    truncate_outliers: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Preprocess feature values using the parameters in ``df_feature_info``.

    For more details on what these preprocessing parameters are,
    see :ref:`documentation <preprocessing_parameters>`.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Data frame with raw feature values that will be used to generate
        the scores. Each feature is stored in a separate column. Each row
        corresponds to one response. There should also be a column named
        "spkitemid" containing a unique ID for each response.
    df_feature_info : pandas.DataFrame
        Data frame indexed by feature name (the index values should match
        the feature column names in ``df_input``) with preprocessing
        parameters in the following columns:

        - "sign" : 1 or -1. Indicates whether the feature value needs to
          be multiplied by -1.
        - "transform" : :ref:`transformation <select_transformations_rsmtool>`
          that needs to be applied to this feature.
        - "train_mean", "train_sd" : mean and standard deviation for outlier
          truncation.
        - "train_transformed_mean", "train_transformed_sd" : mean and standard
          deviation for computing z-scores.
    standardize_features : bool
        Whether the features should be standardized prior to prediction.
        Defaults to ``True``.
    truncate_outliers : bool
        Whether outlier should be truncated prior to prediction.
        Defaults to ``True``.

    Returns
    -------
    df_features_preprocessed : pandas.DataFrame
        Data frame with processed feature values.
    df_excluded : pandas.DataFrame
        Data frame with responses excluded from further analysis
        due to non-numeric feature values in the original file
        or after applying transformations. This data frame always
        contains the original feature values.

    Raises
    ------
    KeyError
        If some of the features specified in ``df_feature_info`` are
        not present in ``df_input``.
    ValueError
        If all responses have at least one non-numeric feature value
        and, therefore, no score can be generated for any of the
        responses.
    """
    # the features the model needs are the index of the feature info frame
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [name for name in df_input if name != "spkitemid"]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError(
            f"The input feature file is missing the following features: {missing_features}"
        )

    # anything else in the input is ignored, but tell the user about it
    extra_features = set(input_feature_columns).difference(required_features + ["spkitemid"])
    if extra_features:
        self.logger.warning(
            f"The following extraneous features will be ignored: {extra_features}"
        )

    # keep the ID and the required features; if human scores are available,
    # keep "sc1" as well for consistency with other tools
    features_to_keep = ["spkitemid", *required_features]
    if "sc1" in df_input:
        features_to_keep.append("sc1")
    df_features = df_input[features_to_keep]

    self.logger.info("Pre-processing input features")

    # first we need to filter out NaNs and any other
    # weird features, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)
    for feature_name in required_features:
        df_filtered, df_newly_excluded = self.filter_on_column(
            df_filtered,
            feature_name,
            exclude_zeros=False,
            exclude_zero_sd=False,
        )

        # accumulate the excluded responses but ignore the
        # FutureWarning about the behavior of Dataframe concatenation
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="The behavior of Dataframe concatenation",
                category=FutureWarning,
            )
            df_excluded = pd.concat([df_excluded, df_newly_excluded], sort=True)

    # make sure that the remaining data frame is not empty
    if not len(df_filtered):
        raise ValueError(
            "There are no responses left after "
            "filtering out non-numeric feature values. No analysis "
            "will be run."
        )

    # `df_features` keeps the original (filtered) values so that excluded
    # responses can be reported with them; all changes go into the copy
    df_features = df_filtered.copy()
    df_features_preprocess = df_features.copy()
    for feature_name in required_features:
        raw_values = df_features_preprocess[feature_name].values

        # look up the preprocessing parameters for this feature
        feature_params = df_feature_info.loc[feature_name]
        feature_transformation = feature_params["transform"]
        feature_sign = feature_params["sign"]
        train_feature_mean = feature_params["train_mean"]
        train_feature_sd = feature_params["train_sd"]
        train_transformed_mean = feature_params["train_transformed_mean"]
        train_transformed_sd = feature_params["train_transformed_sd"]

        # transform the feature values and remove outliers
        df_features_preprocess[feature_name] = self.preprocess_feature(
            raw_values,
            feature_name,
            feature_transformation,
            train_feature_mean,
            train_feature_sd,
            exclude_zero_sd=False,
            raise_error=False,
            truncate_outliers=truncate_outliers,
        )

        # filter the feature values once again to remove possible NaN and
        # inf values that might have emerged when applying transformations.
        # We do not need to do that if no transformation was applied, or
        # if there are no NaN or inf values to begin with.
        if feature_transformation not in ("raw", "org") and (
            np.isnan(df_features_preprocess[feature_name]).any()
            or np.isinf(df_features_preprocess[feature_name]).any()
        ):
            df_features_preprocess, df_newly_excluded = self.filter_on_column(
                df_features_preprocess,
                feature_name,
                exclude_zeros=False,
                exclude_zero_sd=False,
            )

            # add the response(s) with missing values to the excluded
            # responses but make sure we are adding the original values,
            # not the preprocessed ones
            was_dropped = df_features["spkitemid"].isin(df_newly_excluded["spkitemid"])
            df_excluded = pd.merge(df_excluded, df_features[was_dropped].copy(), how="outer")

        if standardize_features:
            # now standardize the feature values
            centered_values = df_features_preprocess[feature_name] - train_transformed_mean
            df_features_preprocess[feature_name] = centered_values / train_transformed_sd

        # multiply features by sign
        signed_values = df_features_preprocess[feature_name] * feature_sign
        df_features_preprocess[feature_name] = signed_values

    # we need to make sure that `spkitemid` is the first column
    excluded_column_order = ["spkitemid"] + [
        name for name in df_excluded if name != "spkitemid"
    ]
    df_excluded = df_excluded[excluded_column_order]

    return (df_features_preprocess, df_excluded)