Source code for rsmtool.analyzer

"""
Classes for analyzing RSMTool predictions, metrics, etc.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import logging
import warnings
from functools import partial
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from scipy.stats import kurtosis, pearsonr
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, mean_squared_error, r2_score
from skll.metrics import kappa
from wandb.sdk.lib import RunDisabled
from wandb.wandb_run import Run

from .configuration_parser import Configuration
from .container import DataContainer, DatasetDict
from .utils import wandb
from .utils.metrics import (
    agreement,
    difference_of_standardized_means,
    partial_correlations,
    quadratic_weighted_kappa,
    standardized_mean_difference,
)
from .utils.prmse import get_true_score_evaluations



[docs]
class Analyzer:
    """Class to perform analysis on all metrics, predictions, etc."""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Initialize the Analyzer object."""
        self.logger = logger if logger else logging.getLogger(__name__)


[docs]
    @staticmethod
    def check_frame_names(data_container: DataContainer, dataframe_names: List[str]) -> None:
        """
        Check that all specified dataframes are available.

        This method checks to make sure all specified DataFrames
        are in the given data container object.

        Parameters
        ----------
        data_container : DataContainer
            A DataContainer object
        dataframe_names : List[str]
            The names of the DataFrames expected in the DataContainer object.

        Raises
        ------
        KeyError
            If a given dataframe_name is not in the DataContainer object.
        """
        for dataframe_name in dataframe_names:
            if dataframe_name not in data_container:
                raise KeyError(
                    f"The DataFrame `{dataframe_name}` does not exist "
                    f"in the DataContainer object."
                )



[docs]
    @staticmethod
    def check_param_names(configuration_obj: Configuration, parameter_names: List[str]) -> None:
        """
        Check that all specified parameters are available.

        This method checks to make sure all specified parameters
        are in the given configuration object.

        Parameters
        ----------
        configuration_obj : Configuration
            A configuration object
        parameter_names : List[str]
            The names of the parameters (keys) expected in the
            Configuration object.

        Raises
        ------
        KeyError
            If a given parameter_name is not in the Configuration object.
        """
        for parameter_name in parameter_names:
            if parameter_name not in configuration_obj:
                raise KeyError(
                    f"The parameter `{parameter_name}` does not exist "
                    f"in the Configuration object."
                )



[docs]
    @staticmethod
    def analyze_excluded_responses(
        df: pd.DataFrame,
        features: List[str],
        header: str,
        exclude_zero_scores: bool = True,
        exclude_listwise: bool = False,
    ) -> pd.DataFrame:
        """
        Compute statistics for responses excluded from analyses.

        This method computes various statistics for the responses that
        were excluded from analyses, either in the training set or in
        the test set.

        Parameters
        ----------
        df : pandas.DataFrame
            Data frame containing the excluded responses
        features : List[str]
            List of column names containing the features to which we want to
            restrict the analyses.
        header : str
            String to be used as the table header for the output data frame.
        exclude_zero_scores : bool
            Whether or not the zero-score responses should be counted in the
            exclusion statistics.
            Defaults to ``True``.
        exclude_listwise : bool
            Whether or not the candidates were excluded based on minimal number
            of responses.
            Defaults to ``False``.

        Returns
        -------
        df_full_crosstab : pandas.DataFrame
            Two-dimensional data frame containing the exclusion statistics.
        """
        # create an empty output data frame
        df_full_crosstab = pd.DataFrame(
            {
                "all features numeric": [0, 0, 0],
                "non-numeric feature values": [0, 0, 0],
            },
            index=[
                "numeric non-zero human score",
                "zero human score",
                "non-numeric human score",
            ],
        )

        if not df.empty:
            # re-code human scores into numeric, missing or zero
            df["score_category"] = "numeric non-zero human score"
            df.loc[df["sc1"].isnull(), "score_category"] = "non-numeric human score"
            df.loc[df["sc1"].astype(float) == 0, "score_category"] = "zero human score"

            # recode feature values: a response with at least one
            # missing feature is assigned 'non-numeric feature values'
            df_features_only = df[features + ["spkitemid"]]
            null_feature_rows = df_features_only.isnull().any(axis=1)
            df_null_features = df_features_only[null_feature_rows]
            df["feat_category"] = "all features numeric"
            df.loc[df["spkitemid"].isin(df_null_features["spkitemid"]), "feat_category"] = (
                "non-numeric feature values"
            )

            # crosstabulate
            df_crosstab = pd.crosstab(df["score_category"], df["feat_category"])
            df_full_crosstab.update(df_crosstab)
            # convert back to integers as these are all counts
            df_full_crosstab = df_full_crosstab.astype(int)
            df_full_crosstab.insert(0, header, df_full_crosstab.index)

        if not exclude_listwise:
            # if we are not excluding listwise, set the first cell to None so
            # that it is not set to zero
            assert df_full_crosstab.loc["numeric non-zero human score", "all features numeric"] == 0
            df_full_crosstab.loc["numeric non-zero human score", "all features numeric"] = None

            # if we are not excluding the zeros, set the corresponding cells to None
            # so that they are not set to zero. We do not do this for listwise exclusion
            if not exclude_zero_scores:
                assert df_full_crosstab.loc["zero human score", "all features numeric"] == 0
                df_full_crosstab.loc["zero human score", "all features numeric"] = None

        return df_full_crosstab



[docs]
    @staticmethod
    def analyze_used_responses(
        df_train: pd.DataFrame, df_test: pd.DataFrame, subgroups: List[str], candidate_column: str
    ) -> pd.DataFrame:
        """
        Compute statistics for responses used in analyses.

        This method computes various statistics on the responses that
        were used in analyses, either in the training set or in the
        test set.

        Parameters
        ----------
        df_train : pandas.DataFrame
            Data frame containing the response information for the training set.
        df_test : pandas.DataFrame
            Data frame containing the response information for the test set.
        subgroups : List[str]
            List of column names that contain grouping information.
        candidate_column : str
            Column name that contains candidate identification information.

        Returns
        -------
        df_analysis : pandas.DataFrame
            Data frame containing information about the used responses.
        """
        # create a basic data frame for responses only
        train_responses = set(df_train["spkitemid"])
        test_responses = set(df_test["spkitemid"])

        rows = [
            {"partition": "Training", "responses": len(train_responses)},
            {"partition": "Evaluation", "responses": len(test_responses)},
            {
                "partition": "Overlapping",
                "responses": len(train_responses & test_responses),
            },
            {"partition": "Total", "responses": len(train_responses | test_responses)},
        ]

        df_analysis = pd.DataFrame.from_dict(rows)

        columns = ["partition", "responses"] + subgroups

        if candidate_column:
            train_candidates = set(df_train["candidate"])
            test_candidates = set(df_test["candidate"])
            df_analysis["candidates"] = [
                len(train_candidates),
                len(test_candidates),
                len(train_candidates & test_candidates),
                len(train_candidates | test_candidates),
            ]

            columns = ["partition", "responses", "candidates"] + subgroups

        for group in subgroups:
            train_group = set(df_train[group])
            test_group = set(df_test[group])
            df_analysis[group] = [
                len(train_group),
                len(test_group),
                len(train_group & test_group),
                len(train_group | test_group),
            ]

        df_analysis = df_analysis[columns]
        return df_analysis



[docs]
    @staticmethod
    def analyze_used_predictions(
        df_test: pd.DataFrame, subgroups: List[str], candidate_column: str
    ) -> pd.DataFrame:
        """
        Compute various statistics for predictions used in analyses.

        Parameters
        ----------
        df_test : pandas.DataFrame
            Data frame containing the test set predictions.
        subgroups : List[str]
            List of column names that contain grouping information.
        candidate_column : str
            Column name that contains candidate identification information.

        Returns
        -------
        df_analysis : pandas.DataFrame
            Data frame containing information about the used predictions.
        """
        rows = [{"partition": "Evaluation", "responses": df_test["spkitemid"].size}]

        df_analysis = pd.DataFrame.from_dict(rows)
        df_columns = ["partition", "responses"] + subgroups

        if candidate_column:
            df_analysis["candidates"] = [df_test["candidate"].unique().size]
            df_columns = ["partition", "responses", "candidates"] + subgroups

        for group in subgroups:
            df_analysis[group] = [df_test[group].unique().size]

        df_analysis = df_analysis[df_columns]
        return df_analysis



[docs]
    @staticmethod
    def compute_basic_descriptives(df: pd.DataFrame, selected_features: List[str]) -> pd.DataFrame:
        """
        Compute basic descriptive statistics for columns in the given data frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data frame containing the feature values.
        selected_features : List[str]
            List of feature names for which to compute the descriptives.

        Returns
        -------
        df_desc : pandas.DataFrame
            DataFrame containing the descriptives for each of the features.
        """
        # select only feature columns
        df_desc = df[selected_features]

        # get the H1 scores
        scores = df["sc1"]

        # compute correlations and p-values separately for efficiency
        cor_series = df_desc.apply(lambda s: pearsonr(s, scores))
        cors = cor_series.apply(lambda t: t[0])
        pvalues = cor_series.apply(lambda t: t[1])

        # create a data frame with all the descriptives
        df_output = pd.DataFrame(
            {
                "mean": df_desc.mean(),
                "min": df_desc.min(),
                "max": df_desc.max(),
                "std. dev.": df_desc.std(),
                "skewness": df_desc.skew(),
                "kurtosis": df_desc.apply(lambda s: kurtosis(s, fisher=False)),
                "Correlation": cors,
                "p": pvalues,
                "N": len(df_desc),
            }
        )

        # reorder the columns to make it look better
        df_output = df_output[
            [
                "mean",
                "std. dev.",
                "min",
                "max",
                "skewness",
                "kurtosis",
                "Correlation",
                "p",
                "N",
            ]
        ]

        return df_output



[docs]
    @staticmethod
    def compute_percentiles(
        df: pd.DataFrame, selected_features: List[str], percentiles: Optional[List[int]] = None
    ) -> pd.DataFrame:
        """
        Compute percentiles and outliers for columns in the given data frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data frame containing the feature values.
        selected_features : List[str]
            List of feature names for which to compute the percentile descriptives.
        percentiles : Optional[List[int]
            The percentiles to calculate. If ``None``, use the percentiles
            {1, 5, 25, 50, 75, 95, 99}.
            Defaults to ``None``.

        Returns
        -------
        df_output : pandas.DataFrame
            Data frame containing the percentile information for each of the features.
        """
        # select only feature columns
        df_desc = df[selected_features]

        # compute the various percentile levels
        if percentiles is None:
            percentiles = [1, 5, 25, 50, 75, 95, 99]

        df_output = df_desc.apply(
            lambda series: pd.Series(np.percentile(series, percentiles, method="lower"))
        )
        df_output = df_output.transpose()

        # change the column names to be more readable
        df_output.columns = [f"{p}%" for p in percentiles]

        # add the inter-quartile range column
        df_output["IQR"] = df_output["75%"] - df_output["25%"]

        # compute the various outlier statistics
        mild_upper = df_output["75%"] + 1.5 * df_output["IQR"]
        mild_bottom = df_output["25%"] - 1.5 * df_output["IQR"]

        extreme_upper = df_output["75%"] + 3 * df_output["IQR"]
        extreme_bottom = df_output["25%"] - 3 * df_output["IQR"]

        # compute the mild and extreme outliers
        num_mild_outliers = {}
        num_extreme_outliers = {}
        for c in df_desc.columns:
            is_extreme = (df_desc[c] <= extreme_bottom[c]) | (df_desc[c] >= extreme_upper[c])

            is_mild = (df_desc[c] > extreme_bottom[c]) & (df_desc[c] <= mild_bottom[c])
            is_mild = is_mild | ((df_desc[c] >= mild_upper[c]) & (df_desc[c] < extreme_upper[c]))
            num_mild_outliers[c] = len(df_desc[is_mild])
            num_extreme_outliers[c] = len(df_desc[is_extreme])

        # add those to the output data frame
        df_output["Mild outliers"] = pd.Series(num_mild_outliers)
        df_output["Extreme outliers"] = pd.Series(num_extreme_outliers)

        return df_output



[docs]
    @staticmethod
    def compute_outliers(df: pd.DataFrame, selected_features: List[str]) -> pd.DataFrame:
        """
        Compute number and percentage of outliers for given columns.

        This method computes the number and percentage of outliers
        that lie outside the range mean +/- 4 SD for each of the
        given columns in the given data frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data frame containing the feature values.
        selected_features : List[str]
            List of feature names for which to compute outlier information.

        Returns
        -------
        df_output : pandas.DataFrame
            Data frame containing outlier information for each of the features.
        """
        # select only feature columns
        df_desc = df[selected_features]

        # compute the means and standard deviations
        means = df_desc.mean()
        stds = df_desc.std()

        # compute the number of upper and lower outliers
        lower_outliers = {}
        upper_outliers = {}
        for c in df_desc.columns:
            lower_outliers[c] = len(df_desc[df_desc[c] < means[c] - 4 * stds[c]])
            upper_outliers[c] = len(df_desc[df_desc[c] > means[c] + 4 * stds[c]])

        # generate the output data frame
        lower_s = pd.Series(lower_outliers)
        upper_s = pd.Series(upper_outliers)
        both_s = lower_s + upper_s
        df_output = pd.DataFrame(
            {
                "lower": lower_s,
                "upper": upper_s,
                "both": both_s,
                "lowerperc": round(lower_s / len(df_desc) * 100, 2),
                "upperperc": round(upper_s / len(df_desc) * 100, 2),
                "bothperc": round(both_s / len(df_desc) * 100, 2),
            }
        )

        return df_output



[docs]
    @staticmethod
    def compute_pca(
        df: pd.DataFrame, selected_features: List[str]
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Run a PCA decomposition on the given feature columns.

        The PCA is fit on the data frame restricted to the given columns,
        with the number of components set to min(n_features, n_samples).

        Parameters
        ----------
        df : pandas.DataFrame
            Input data frame containing feature values.
        selected_features : List[str]
            List of feature names to be used in the PCA decomposition.

        Returns
        -------
        df_components : pandas.DataFrame
            Data frame containing the PCA components, with one row per
            feature and one column per component ("PC1", "PC2", ...).
        df_variance : pandas.DataFrame
            Data frame containing the variance information, with the rows
            "Eigenvalues", "Percentage of variance" and "Cumulative
            percentage of variance" and one column per component.
        """
        feature_frame = df[selected_features]

        # fit the PCA with as many components as the data allows
        pca = PCA(n_components=min(len(selected_features), len(feature_frame)))
        pca.fit(feature_frame)

        # label the components PC1, PC2, ... based on how many were fit
        component_labels = [f"PC{number}" for number in range(1, len(pca.components_) + 1)]

        # components: features as rows and principal components as columns
        df_components = pd.DataFrame(
            pca.components_, index=component_labels, columns=selected_features
        ).transpose()

        # variance information: statistics as rows and principal components
        # as columns
        variance_ratios = pca.explained_variance_ratio_
        df_variance = pd.DataFrame(
            {
                "Eigenvalues": pca.explained_variance_,
                "Percentage of variance": variance_ratios,
                "Cumulative percentage of variance": np.cumsum(variance_ratios),
            },
            index=component_labels,
        ).transpose()

        return df_components, df_variance



[docs]
    @staticmethod
    def correlation_helper(
        df: pd.DataFrame, target_variable: str, grouping_variable: str, include_length: bool = False
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Compute marginal and partial correlations for all columns.

        This helper method computes marginal and partial correlations of
        all the columns in the given data frame against the target variable
        separately for each level in the the grouping variable.
        If ``include_length`` is ``True``, it additionally computes partial
        correlations of each column in the data frame against the target
        variable after controlling for the "length" column.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data frame containing numeric feature values, the numeric
            `target variable` and the `grouping variable`.
        target_variable: str
            The name of the column used as a reference for computing correlations.
        grouping_variable: str
            The name of the column defining groups in the data
        include_length: bool
            If True compute additional partial correlations of each column
            in the data frame against `target variable` only partialling out
            "length" column.

        Returns
        -------
        df_target_cors : pandas.DataFrame
            Data frame containing Pearson's correlation coefficients for
            marginal correlations between features and `target_variable`.
        df_target_partcors : pandas.DataFrame
            Data frame containing Pearson's correlation coefficients for
            partial correlations between each feature and `target_variable`
            after controlling for all other features. If ``include_length`` is
            set to ``True``, the "length" column will not be included in the
            partial correlation computation.
        df_target_partcors_no_length: pandas.DataFrame
            If ``include_length`` is set to ``True``: Data frame containing
            Pearson's correlation coefficients for partial correlations
            between each feature and ``target_variable`` after controlling
            for "length". Otherwise, it will be an empty data frame.
        """
        # group by the group columns
        grouped = df.groupby(grouping_variable)

        df_target_cors = pd.DataFrame()
        df_target_pcorr = pd.DataFrame()
        df_target_pcorr_no_length = pd.DataFrame()

        for group, df_group in grouped:
            df_group = df_group.drop(grouping_variable, axis=1)

            # first check if we have at least 2 cases and return np.nan otherwise
            if len(df_group) == 1:
                df_target_cors[group] = pd.Series(data=np.nan, index=df_group.columns)
                df_target_pcorr[group] = pd.Series(data=np.nan, index=df_group.columns)
                df_target_pcorr_no_length[group] = pd.Series(data=np.nan, index=df_group.columns)
            else:
                # if we are asked to include length, that means 'length' is
                # in the data frame which means that we want to exclude that
                # before computing the regular marginal and partial correlations
                if not include_length:
                    df_target_cors[group] = df_group.apply(
                        lambda s: pearsonr(s, df_group[target_variable])[0]
                    )
                    df_target_pcorr[group] = partial_correlations(df_group)[target_variable]
                else:
                    df_group_no_length = df_group.drop("length", axis=1)

                    partial_pearsonr = partial(pearsonr, y=df_group_no_length[target_variable])
                    df_target_cors[group] = df_group_no_length.apply(
                        lambda s: partial_pearsonr(s)[0]
                    )

                    df_target_pcorr[group] = partial_correlations(df_group_no_length)[
                        target_variable
                    ]
                    pcor_dict = {}
                    columns = [c for c in df_group.columns if c not in ["sc1", "length"]]
                    for c in columns:
                        pcor_dict[c] = partial_correlations(df_group[[c, "sc1", "length"]])["sc1"][
                            c
                        ]
                    df_target_pcorr_no_length[group] = pd.Series(pcor_dict)

        # remove the row containing the correlation of the target variable
        # with itself and take the transpose
        df_target_cors = df_target_cors.drop(target_variable).transpose()
        df_target_pcorr = df_target_pcorr.drop(target_variable).transpose()
        df_target_pcorr_no_length = df_target_pcorr_no_length.transpose()

        return (df_target_cors, df_target_pcorr, df_target_pcorr_no_length)



[docs]
    @staticmethod
    def metrics_helper(
        human_scores: pd.Series,
        system_scores: pd.Series,
        population_human_score_sd: Optional[float] = None,
        population_system_score_sd: Optional[float] = None,
        population_human_score_mn: Optional[float] = None,
        population_system_score_mn: Optional[float] = None,
        smd_method: str = "unpooled",
        use_diff_std_means: bool = False,
    ) -> pd.Series:
        """
        Compute basic association metrics between system and human scores.

        Parameters
        ----------
        human_scores : pandas.Series
            Series containing numeric human (reference) scores.
        system_scores: pandas.Series
            Series containing numeric scores predicted by the model.
        population_human_score_sd : Optional[float]
            Reference standard deviation for human scores. This must be
            specified when the metrics are computed for a subset of the
            responses, for example, responses from a particular demographic
            subgroup. If ``smd_method`` is "williamson" or "johnson", this
            should be the standard deviation for the whole population (in
            most cases, the whole test set). If ``use_diff_std_means`` is
            ``True``, this must be the standard deviation for the whole
            population and ``population_human_score_mn`` must also be
            specified. Otherwise, it is ignored.
            Defaults to ``None``.
        population_system_score_sd : Optional[float]
            Reference standard deviation for system scores. This must be
            specified when the metrics are computed for a subset of the
            responses, for example, responses from a particular demographic
            subgroup. If ``smd_method`` is "williamson", this should be the
            standard deviation for the whole population (in most cases, the
            whole test set). If ``use_diff_std_means`` is ``True``, this
            must be the standard deviation for the whole population and
            ``population_system_score_mn`` must also be specified.
            Otherwise, it is ignored.
            Defaults to ``None``.
        population_human_score_mn : Optional[float]
            Reference mean for human scores. This must be specified when
            the metrics are computed for a subset of the responses, for
            example, responses from a particular demographic subgroup. If
            ``use_diff_std_means`` is ``True``, this must be the mean for
            the whole population (in most cases, the full test set) and
            ``population_human_score_sd`` must also be specified.
            Otherwise, it is ignored.
            Defaults to ``None``.
        population_system_score_mn : Optional[float]
            Reference mean for system scores. This must be specified when
            the metrics are computed for a subset of the responses, for
            example, responses from a particular demographic subgroup. If
            ``use_diff_std_means`` is ``True``, this must be the mean for
            the whole population (in most cases, the full test set) and
            ``population_system_score_sd`` must also be specified.
            Otherwise, it is ignored.
            Defaults to ``None``.
        smd_method : str
            The SMD method to use, only used if ``use_diff_std_means`` is
            ``False``. All methods have the same numerator
            mean(`y_pred`) - mean(`y_true_observed`) and the following
            denominators :

            - "williamson": pooled population standard deviation of
              `y_true_observed` and `y_pred` computed using
              ``population_human_score_sd`` and ``population_system_score_sd``.
            - "johnson": ``population_human_score_sd``.
            - "pooled": pooled standard deviation of `y_true_observed` and
              `y_pred` for this group.
            - "unpooled": standard deviation of `y_true_observed` for this
              group.

            Defaults to "unpooled".

        use_diff_std_means : bool
            Whether to use the difference of standardized means, rather than
            the standardized mean difference. This is most useful with subgroup
            analysis.
            Defaults to ``False``.

        Returns
        -------
        metrics: pandas.Series
            Series containing different evaluation metrics comparing human
            and system scores. The following metrics are included, in
            this order:

            - `kappa`:  unweighted Cohen's kappa
            - `wtkappa`:  quadratic weighted kappa
            - `exact_agr`: exact agreement
            - `adj_agr`: adjacent agreement with tolerance set to 1
            - One of the following :

              * `SMD`: standardized mean difference, if ``use_diff_std_means``
                is ``False``.
              * `DSM`: difference of standardized means, if ``use_diff_std_means``
                is ``True``.

            - `corr`: Pearson's r, or NaN if there is only one complete pair
              of scores or if either set of scores has a single unique value
            - `R2`: r squared, or NaN if there is only one complete pair
              of scores
            - `RMSE`: root mean square error
            - `sys_min`: min system score
            - `sys_max`: max system score
            - `sys_mean`: mean system score
            - `sys_sd`: standard deviation of system scores (ddof=1)
            - `h_min`: min human score
            - `h_max`: max human score
            - `h_mean`: mean human score
            - `h_sd`: standard deviation of human scores (ddof=1)
            - `N`: total number of responses
        """
        # kappas and agreement statistics
        unweighted_kappa = kappa(human_scores, system_scores)
        weighted_kappa = quadratic_weighted_kappa(human_scores, system_scores)
        exact_agreement = agreement(human_scores, system_scores)
        adjacent_agreement = agreement(human_scores, system_scores, tolerance=1)

        # only the pairs where both scores are available are used to decide
        # whether the correlation and the R2 can be computed
        paired_scores = pd.DataFrame({"human": human_scores, "system": system_scores}).dropna(
            how="any"
        )
        has_single_pair = len(paired_scores) == 1

        # Pearson's r is undefined (NaN) for a single pair or when either
        # set of scores has no variance
        correlation_undefined = (
            has_single_pair
            or len(paired_scores["human"].unique()) == 1
            or len(paired_scores["system"].unique()) == 1
        )
        correlation = (
            np.nan
            if correlation_undefined
            else pearsonr(paired_scores["human"], paired_scores["system"])[0]
        )

        # descriptive statistics for the system scores ...
        system_min = np.min(system_scores)
        system_max = np.max(system_scores)
        system_mean = np.mean(system_scores)
        system_sd = np.std(system_scores, ddof=1)

        # ... and for the human scores
        human_min = np.min(human_scores)
        human_max = np.max(human_scores)
        human_mean = np.mean(human_scores)
        human_sd = np.std(human_scores, ddof=1)

        # either the difference of standardized means or the standardized
        # mean difference, reported under the matching name
        if use_diff_std_means:
            mean_difference_name = "DSM"
            mean_difference = difference_of_standardized_means(
                human_scores,
                system_scores,
                population_human_score_mn,
                population_system_score_mn,
                population_human_score_sd,
                population_system_score_sd,
            )
        else:
            mean_difference_name = "SMD"
            mean_difference = standardized_mean_difference(
                human_scores,
                system_scores,
                population_human_score_sd,
                population_system_score_sd,
                method=smd_method,
            )

        # R2 is not computed for a single pair of scores
        r_squared = np.nan if has_single_pair else r2_score(human_scores, system_scores)

        # root mean squared error
        root_mean_squared_error = np.sqrt(mean_squared_error(human_scores, system_scores))

        # package everything up as a series
        return pd.Series(
            {
                "kappa": unweighted_kappa,
                "wtkappa": weighted_kappa,
                "exact_agr": exact_agreement,
                "adj_agr": adjacent_agreement,
                mean_difference_name: mean_difference,
                "corr": correlation,
                "R2": r_squared,
                "RMSE": root_mean_squared_error,
                "sys_min": system_min,
                "sys_max": system_max,
                "sys_mean": system_mean,
                "sys_sd": system_sd,
                "h_min": human_min,
                "h_max": human_max,
                "h_mean": human_mean,
                "h_sd": human_sd,
                "N": len(system_scores),
            }
        )



[docs]
    @staticmethod
    def compute_disattenuated_correlations(
        human_system_corr: pd.Series, human_human_corr: pd.Series
    ) -> pd.DataFrame:
        """
        Compute disattenuated correlations between human and system scores.

        These are computed as the Pearson's correlation between the human score
        and the system score divided by the square root of correlation between
        two human raters.

        Parameters
        ----------
        human_system_corr : pandas.Series
            Series containing of pearson's correlation coefficients human-system
            correlations.
        human_human_corr : pandas.Series
            Series containing of pearson's correlation coefficients for human-human
            correlations. This can contain a single value or have the index
            matching that of human-system correlations.

        Returns
        -------
        df_correlations: pandas.DataFrame
            Data frame containing the human-system correlations, human-human
            correlations, and disattenuated correlations.
        """
        # if we only have a single value for human correlation and the index
        # is not in human-system values, we use the same HH value in all cases
        if len(human_human_corr) == 1 and human_human_corr.index[0] not in human_system_corr.index:
            human_human_corr = pd.Series(
                human_human_corr.values.repeat(len(human_system_corr)),
                index=human_system_corr.index,
            )

        # we now concatenate the two series on index
        df_correlations = pd.concat(
            [human_system_corr, human_human_corr],
            axis=1,
            sort=True,
            keys=["corr_HM", "corr_HH"],
        )

        # if any of the HH correlations are negative, we will ignore these
        # and treat them as Nones
        with np.errstate(invalid="ignore"):
            df_correlations["sqrt_HH"] = np.sqrt(df_correlations["corr_HH"])

        df_correlations["corr_disattenuated"] = (
            df_correlations["corr_HM"] / df_correlations["sqrt_HH"]
        )

        return df_correlations



[docs]
    def compute_correlations_by_group(
        self,
        df: pd.DataFrame,
        selected_features: List[str],
        target_variable: str,
        grouping_variable: str,
        include_length: bool = False,
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Compute marginal and partial correlations against target variable.

        The given data frame is restricted to the selected features, the
        target variable, the grouping variable and, optionally, "length".
        A second copy of these rows, with the grouping variable set to
        "All data", is stacked underneath so that ``correlation_helper()``
        computes the correlations over the whole data in addition to each
        level of the grouping variable.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data frame.
        selected_features : List[str]
            List of feature names for which to compute the correlations.
        target_variable : str
            Name of the column containing the target variable, i.e., the
            dependent variable.
        grouping_variable : str
            Name of the column containing the grouping information.
        include_length : bool
            Whether to keep the "length" column and pass ``include_length``
            on to ``correlation_helper()``.
            Defaults to ``False``.

        Returns
        -------
        df_target_cors : pandas.DataFrame
            Data frame containing Pearson's correlation coefficients for
            marginal correlations between features and ``target_variable``.
        df_target_partcors : pandas.DataFrame
            Data frame containing Pearson's correlation coefficients for
            partial correlations between each feature and ``target_variable``
            after controlling for all other features.
        df_target_partcors_no_length : pandas.DataFrame
            If ``include_length`` is ``True``: data frame containing
            Pearson's correlation coefficients for partial correlations
            between each feature and ``target_variable`` after controlling
            for "length". Otherwise, it will be an empty data frame.
        """
        # the only columns that take part in the computation
        needed_columns = (
            selected_features
            + [target_variable, grouping_variable]
            + (["length"] if include_length else [])
        )
        df_per_group = df.copy()[needed_columns]

        # same rows again, but all assigned to a single catch-all group
        df_whole_data = df_per_group.copy()
        df_whole_data[grouping_variable] = "All data"

        # stack the per-group rows and the catch-all rows on a fresh index
        df_stacked = pd.concat([df_per_group, df_whole_data], sort=True).reset_index(drop=True)

        # marginal and partial correlations are computed by the shared helper
        return self.correlation_helper(
            df_stacked,
            target_variable,
            grouping_variable,
            include_length=include_length,
        )



[docs]
    def filter_metrics(
        self,
        df_metrics: pd.DataFrame,
        use_scaled_predictions: bool = False,
        chosen_metric_dict: Optional[Dict[str, List[str]]] = None,
    ) -> pd.DataFrame:
        """
        Filter data frame to retain only the given metrics.

        ``df_metrics`` holds one row per score type ("raw", "raw_trim",
        "scale", etc.) and one column per metric. This method picks out the
        (score type, metric) pairs listed in ``chosen_metric_dict`` and
        returns them as a single-row data frame. The available metric
        names are:

        - "corr"
        - "kappa"
        - "wtkappa"
        - "exact_agr"
        - "adj_agr"
        - "SMD" or "DSM", depending on what is in ``df_metrics``.
        - "RMSE"
        - "R2"
        - "sys_min"
        - "sys_max"
        - "sys_mean"
        - "sys_sd"
        - "h_min"
        - "h_max"
        - "h_mean"
        - "h_sd"
        - "N"

        Parameters
        ----------
        df_metrics : pandas.DataFrame
            The DataFrame to filter.
        use_scaled_predictions : bool
            Whether the default metrics should be taken from the "scale"
            rather than the "raw" score types. Only used when
            ``chosen_metric_dict`` is not specified.
            Defaults to ``False``.
        chosen_metric_dict : Optional[Dict[str, List[str]]]
            The dictionary mapping each score type to the metrics that
            should be retained for it.
            Defaults to ``None``.

        Returns
        -------
        df_filtered_metrics : pandas.DataFrame
            A single-row DataFrame. Columns are named "<metric>.<score type>",
            except for "h_mean", "h_sd", and "N", which keep their bare name.

        Note
        ----
        If ``chosen_metric_dict`` is not specified (or is empty), the
        following default dictionary with the recommended metrics is used::

            {"X_trim": ["N", "h_mean", "h_sd", "sys_mean", "sys_sd", "wtkappa",
                        "corr", "SMD", "RMSE", "R2"],
             "X_trim_round": ["sys_mean", "sys_sd", "kappa",
                              "exact_agr", "adj_agr", "SMD"]}

        where X = "raw" or "scale" depending on whether
        ``use_scaled_predictions`` is ``False`` or ``True``, respectively,
        and "SMD" is replaced with "DSM" if ``df_metrics`` has a "DSM" column.
        """
        if chosen_metric_dict:
            metrics_by_score_type = chosen_metric_dict
        else:
            # fall back to the recommended metrics for raw or scaled scores
            prefix = "scale" if use_scaled_predictions else "raw"
            smd_name = "DSM" if "DSM" in df_metrics else "SMD"
            trimmed_metrics = [
                "N",
                "h_mean",
                "h_sd",
                "sys_mean",
                "sys_sd",
                "wtkappa",
                "corr",
                smd_name,
                "RMSE",
                "R2",
            ]
            trimmed_rounded_metrics = [
                "sys_mean",
                "sys_sd",
                "kappa",
                "exact_agr",
                "adj_agr",
                smd_name,
            ]
            metrics_by_score_type = {
                f"{prefix}_trim": trimmed_metrics,
                f"{prefix}_trim_round": trimmed_rounded_metrics,
            }

        # these metrics are stored under their bare name, without a
        # score type suffix
        unsuffixed_metrics = ("h_mean", "h_sd", "N")

        # look up every requested (metric, score type) cell
        selected_values = {}
        for score_type, metric_names in metrics_by_score_type.items():
            for metric in metric_names:
                if metric in unsuffixed_metrics:
                    label = metric
                else:
                    label = f"{metric}.{score_type}"
                selected_values[label] = df_metrics[metric][score_type]

        return pd.DataFrame([selected_values])



[docs]
    def compute_metrics(
        self,
        df: pd.DataFrame,
        compute_shortened: bool = False,
        use_scaled_predictions: bool = False,
        include_second_score: bool = False,
        population_sd_dict: Optional[Dict[str, Optional[float]]] = None,
        population_mn_dict: Optional[Dict[str, Optional[float]]] = None,
        smd_method: str = "unpooled",
        use_diff_std_means: bool = False,
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Compute association metrics for scores in the given data frame.

        Every column of ``df`` is evaluated against the `sc1` column using
        ``metrics_helper()``. If ``include_second_score`` is ``True``, it is
        assumed that a column called `sc2` containing a second human score
        is available; this column is left out of the human-system metrics
        and is instead compared to `sc1` -- using only the rows where `sc2`
        is not null -- to obtain the human-human metrics.

        If ``compute_shortened`` is ``True``, then this function also
        computes a shortened version of the full human-system metrics data
        frame. See ``filter_metrics()`` for the description of the default
        columns included in the shortened data frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data frame.
        compute_shortened : bool
            Also compute a shortened version of the full metrics data frame.
            Defaults to ``False``.
        use_scaled_predictions : bool
            Use evaluations based on scaled predictions in the shortened
            version of the metrics data frame.
            Defaults to ``False``.
        include_second_score : bool
            Second human score available.
            Defaults to ``False``.
        population_sd_dict : Optional[Dict[str, Optional[float]]]
            Dictionary containing population standard deviation for each
            column containing human or system scores. This is used to compute
            SMD for subgroups. If ``None``, a dummy dictionary is created that
            sets the standard deviation for all columns to ``None``.
            Defaults to ``None``.
        population_mn_dict : Optional[Dict[str, Optional[float]]]
            Dictionary containing population mean for each column containing
            human or system scores. This is used to compute SMD for subgroups.
            If ``None``, a dummy dictionary is created that sets the mean for
            all columns to ``None``.
            Defaults to ``None``.
        smd_method : str
            The SMD method to use for the human-system metrics, only used if
            ``use_diff_std_means`` is ``False``. The human-human metrics are
            always computed with "pooled". All methods have the same numerator
            mean(`y_pred`) - mean(`y_true_observed`) and the following
            denominators:

            - "williamson": pooled population standard deviation of human and
              system scores computed based on values in ``population_sd_dict``.
            - "johnson": population standard deviation of human scores computed
              based on values in ``population_sd_dict``.
            - "pooled": pooled standard deviation of `y_true_observed` and
              `y_pred` for this group.
            - "unpooled": standard deviation of `y_true_observed` for this
              group.

            Defaults to "unpooled".

        use_diff_std_means : bool
            Whether to use the difference of standardized means, rather than
            the standardized mean difference. This is most useful with subgroup
            analysis.
            Defaults to ``False``.

        Returns
        -------
        df_human_system_eval : pandas.DataFrame
            Data frame containing the full set of evaluation
            metrics, with one row per score type.
        df_human_system_eval_filtered : pandas.DataFrame
            A shortened version of the first data frame but
            is empty if ``compute_shortened`` is ``False``.
        df_human_human_eval : pandas.DataFrame
            Data frame containing the human-human statistics
            but is empty if ``include_second_score`` is ``False``.
        """
        # name of the column holding the mean-difference metric
        smd_name = "DSM" if use_diff_std_means else "SMD"

        # without population statistics, use ``None`` for every column
        if not population_sd_dict:
            population_sd_dict = dict.fromkeys(df.columns)
        if not population_mn_dict:
            population_mn_dict = dict.fromkeys(df.columns)

        # the order in which the metrics are laid out in the output frames
        metric_order = [
            "N",
            "h_mean",
            "h_sd",
            "h_min",
            "h_max",
            "sys_mean",
            "sys_sd",
            "sys_min",
            "sys_max",
            "corr",
            "wtkappa",
            "R2",
            "kappa",
            "exact_agr",
            "adj_agr",
            smd_name,
            "RMSE",
        ]

        def evaluate_against_sc1(df_scores: pd.DataFrame, method: str) -> pd.DataFrame:
            """Evaluate every column of ``df_scores`` against its `sc1` column."""
            return df_scores.apply(
                lambda scores: self.metrics_helper(
                    df_scores["sc1"],
                    scores,
                    population_sd_dict["sc1"],
                    population_sd_dict[scores.name],
                    population_mn_dict["sc1"],
                    population_mn_dict[scores.name],
                    method,
                    use_diff_std_means,
                )
            )

        if include_second_score:
            # the second human score is probably not available for all of
            # the responses, so it is excluded from the human-system metrics
            df_human_system = evaluate_against_sc1(df.drop("sc2", axis=1), smd_method)

            # the human-human metrics only use the double-scored responses
            df_double = df[df["sc2"].notnull()][["sc1", "sc2"]]
            df_human_human = evaluate_against_sc1(df_double, "pooled")

            # drop the sc1-vs-sc1 column, put the rows in the output order,
            # relabel `h_*` -> `h1_*` and `sys_*` -> `h2_*`, drop R2 and RMSE
            # (not meaningful for human raters), and turn metrics into columns
            df_human_human = (
                df_human_human.drop("sc1", axis=1)
                .reindex(metric_order)
                .rename(lambda label: label.replace("h_", "h1_").replace("sys_", "h2_"))
                .drop(["R2", "RMSE"])
                .transpose()
            )

            # store N as an integer; fall back to 0 if it cannot be converted
            try:
                df_human_human["N"] = df_human_human["N"].astype(int)
            except ValueError:
                df_human_human["N"] = 0
            df_human_human.index = [""]
        else:
            df_human_system = evaluate_against_sc1(df, smd_method)
            df_human_human = pd.DataFrame()

        # drop the sc1-vs-sc1 column, turn metrics into columns and put
        # them in the output order
        df_human_system = df_human_system.drop("sc1", axis=1).transpose()[metric_order]

        # N is a count, so store it as an integer
        df_human_system["N"] = df_human_system["N"].astype(int)

        # keep the score types that are present, in their canonical order
        score_type_order = [
            "raw",
            "raw_trim",
            "raw_trim_round",
            "scale",
            "scale_trim",
            "scale_trim_round",
        ]
        present_score_types = [
            score_type for score_type in score_type_order if score_type in df_human_system.index
        ]
        df_human_system = df_human_system.reindex(present_score_types)

        # extract the default metrics for the shortened frame if requested
        if compute_shortened:
            df_human_system_filtered = self.filter_metrics(
                df_human_system, use_scaled_predictions=use_scaled_predictions
            )
        else:
            df_human_system_filtered = pd.DataFrame()

        return (df_human_system, df_human_system_filtered, df_human_human)



[docs]
    def compute_metrics_by_group(
        self,
        df_test: pd.DataFrame,
        grouping_variable: str,
        use_scaled_predictions: bool = False,
        include_second_score: bool = False,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Compute a subset of evaluation metrics by subgroups.

        This method computes a subset of evaluation metrics for the scores
        in the given data frame for each group specified in
        ``grouping_variable`` as well as for a pseudo-group called "All data"
        that contains every row. The metrics are computed by calling
        ``compute_metrics()`` with ``use_diff_std_means=True`` and with the
        mean and standard deviation of each score column over the whole of
        ``df_test`` as the population statistics. See ``filter_metrics()``
        for a description of the subset of metrics that is selected.

        Parameters
        ----------
        df_test : pandas.DataFrame
            Input data frame.
        grouping_variable : str
            Feature name indicating the column that contains grouping
            information.
        use_scaled_predictions : bool
            Use the scaled predictions when selecting the subset of
            evaluation metrics.
            Defaults to ``False``.
        include_second_score : bool
            Include human-human association statistics.
            Defaults to ``False``.

        Returns
        -------
        df_human_system_by_group : pandas.DataFrame
            Data frame containing the human-system association statistics,
            with one row per group.
        df_human_human_by_group : pandas.DataFrame
            Data frame that either contains the human-human statistics (one
            row per group) or is an empty data frame, depending on whether
            ``include_second_score`` is ``True``.

        Warns
        -----
        UserWarning
            If the standard deviation of any score column over the whole of
            ``df_test`` is (close to) zero.
        """
        # population statistics are needed for all columns other than
        # the id and the subgroup
        score_columns = [
            col for col in df_test.columns if col not in ["spkitemid", grouping_variable]
        ]
        population_sd_dict = {col: df_test[col].std(ddof=1) for col in score_columns}
        population_mn_dict = {col: df_test[col].mean() for col in score_columns}

        # check if any of the standard deviations is zero and
        # tell user to expect to see many warnings.
        zero_sd_scores = [
            score for (score, sd) in population_sd_dict.items() if np.isclose(sd, 0, atol=1e-07)
        ]
        if zero_sd_scores:
            warnings.warn(
                f"The standard deviation for {', '.join(zero_sd_scores)} "
                f"scores is zero (all values are the same). You will see "
                f"multiple warnings about DSM computation since this metric "
                f"is computed separately for each subgroup."
            )

        # create a duplicate data frame to compute evaluations
        # over the whole data, i.e., across groups
        df_preds_all = df_test.copy()
        df_preds_all[grouping_variable] = "All data"

        # combine the two data frames
        df_preds_combined = pd.concat([df_test, df_preds_all], sort=True)
        df_preds_combined.reset_index(drop=True, inplace=True)

        # the per-group results are collected first and the output frames
        # are built once at the end; this avoids re-copying everything
        # accumulated so far on every iteration
        human_system_by_group = {}
        human_human_frames = []

        for group, df_group in df_preds_combined.groupby(grouping_variable):
            df_group = df_group.drop(grouping_variable, axis=1)

            (_, df_human_system_metrics_short, df_human_human_metrics) = self.compute_metrics(
                df_group,
                compute_shortened=True,
                use_scaled_predictions=use_scaled_predictions,
                include_second_score=include_second_score,
                population_sd_dict=population_sd_dict,
                population_mn_dict=population_mn_dict,
                use_diff_std_means=True,
            )

            # the shortened frame has a single row; keep it as a series
            human_system_by_group[group] = df_human_system_metrics_short.iloc[0]

            # keep the human-human metrics, labelled with the group, if
            # we have the second score column available
            if include_second_score:
                df_human_human_metrics.index = [group]
                human_human_frames.append(df_human_human_metrics)

        # one column per group so far; transpose to get one row per group
        df_human_system_by_group = pd.DataFrame(human_system_by_group).transpose()

        # ``pd.concat()`` refuses an empty list, so fall back to an empty frame
        if human_human_frames:
            df_human_human_by_group = pd.concat(human_human_frames)
        else:
            df_human_human_by_group = pd.DataFrame()

        return (df_human_system_by_group, df_human_human_by_group)



[docs]
    def compute_degradation_and_disattenuated_correlations(
        self, df: pd.DataFrame, use_all_responses: bool = True
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Compute the degradation in performance when using system score.

        The degradation is the difference between each human-system metric
        and the corresponding human-human metric, i.e., how much is lost when
        the system predicts the score instead of a second human. This method
        also returns the disattenuated correlations between human and system
        scores as computed by ``compute_disattenuated_correlations()``.

        The system performance can be computed either only on the double
        scored data or on the full data set. Both options have their pros
        and cons. The default is to use the full data set. This function
        also assumes that the `sc2` column exists in the given data frame,
        in addition to `sc1` and the various types of predictions.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data frame.
        use_all_responses : bool
            Use the full data set instead of only using the double-scored
            subset.
            Defaults to ``True``.

        Returns
        -------
        df_degradation : pandas.DataFrame
            Data frame containing the degradation statistics for "corr",
            "kappa", "wtkappa", "exact_agr", "adj_agr", and "SMD".
        df_correlations : pandas.DataFrame
            Data frame containing the human-system correlations, human-human
            correlations and disattenuated correlation.
        """
        # optionally restrict the responses to those with a second human score
        df_responses = df if use_all_responses else df[df["sc2"].notnull()]

        # human-system and human-human metrics; the shortened frame is unused
        df_human_system_eval, _, df_human_human_eval = self.compute_metrics(
            df_responses, include_second_score=True
        )

        df_correlations = self.compute_disattenuated_correlations(
            df_human_system_eval["corr"], df_human_human_eval["corr"]
        )

        # the degradation is only reported for these metrics
        degradation_metrics = [
            "corr",
            "kappa",
            "wtkappa",
            "exact_agr",
            "adj_agr",
            "SMD",
        ]

        # the human-human metrics live in the row labelled with an empty string
        human_human_baseline = df_human_human_eval[degradation_metrics].loc[""]

        # subtract the human-human values from every human-system row
        df_degradation = df_human_system_eval[degradation_metrics].apply(
            lambda row: row - human_human_baseline, axis=1
        )

        return (df_degradation, df_correlations)



[docs]
    def run_training_analyses(
        self, data_container: DataContainer, configuration: Configuration
    ) -> Tuple[Configuration, DataContainer]:
        """
        Run all analyses on the training data.

        Parameters
        ----------
        data_container : DataContainer
            The DataContainer object which must include the following
            DataFrames: {"train_features", "train_metadata",
            "train_preprocessed_features", "train_length", "train_features"}.

        configuration : Configuration
            The Configuration object which must include the following
            parameters (keys): {"length_column", "subgroups",
            "selected_features"}.

        Returns
        -------
        configuration : Configuration
            The input Configuration object that is passed through unmodified.

        data_container : DataContainer
            A new DataContainer object with the following DataFrames:

            - feature_descriptives
            - feature_descriptivesExtra
            - feature_outliers
            - cors_orig
            - cors_processed
            - margcor_score_all_data
            - pcor_score_all_data
            - pcor_score_no_length_all_data
            - margcor_length_all_data
            - pcor_length_all_data
            - pca
            - pcavar
            - margcor_length_by_*
            - pcor_length_by_*
            - margcor_score_by_*
            - pcor_score_by_*
            - pcor_score_no_length_by_*
        """
        frame_names = [
            "train_features",
            "train_metadata",
            "train_preprocessed_features",
            "train_length",
            "train_features",
        ]

        param_names = ["length_column", "subgroups", "selected_features"]

        self.check_frame_names(data_container, frame_names)

        self.check_param_names(configuration, param_names)

        # only use the features selected by the model but keep their order the same
        # as in the original file as ordering may affect the sign in pca
        df_train = data_container["train_features"].copy()
        df_train_length = data_container["train_length"].copy()
        df_train_metadata = data_container["train_metadata"].copy()
        df_train_preprocessed_features = data_container["train_preprocessed_features"].copy()

        subgroups = configuration["subgroups"]
        selected_features = configuration["selected_features"]

        df_train_preprocessed = pd.merge(
            df_train_preprocessed_features, df_train_metadata, on="spkitemid"
        )

        assert (
            len(df_train_preprocessed.index)
            == len(df_train_preprocessed_features.index)
            == len(df_train_metadata.index)
        )

        # get descriptives, percentiles and outliers for the original feature values
        df_descriptives = self.compute_basic_descriptives(df_train, selected_features)
        df_percentiles = self.compute_percentiles(df_train, selected_features)
        df_outliers = self.compute_outliers(df_train, selected_features)

        # set a general boolean flag indicating if we should include length
        include_length = not df_train_length.empty

        # include length if available
        if include_length:
            columns = selected_features + ["sc1", "length"]
            df_train_with_length = df_train.merge(df_train_length, on="spkitemid")
            df_train_preprocess_length = df_train_preprocessed.merge(
                df_train_length, on="spkitemid"
            )
        else:
            columns = selected_features + ["sc1"]
            df_train_with_length = df_train
            df_train_preprocess_length = df_train_preprocessed

        # get pairwise correlations against the original training features
        # as well as the pre-processed training features
        df_pairwise_cors_orig = df_train_with_length[columns].corr(method="pearson")
        df_pairwise_cors_preprocess = df_train_preprocess_length[columns].corr(method="pearson")

        # get marginal and partial correlations against sc1 for all data
        # for partial correlations, we partial out all other features
        df_train_with_group_for_all = df_train_preprocess_length.copy()
        df_train_with_group_for_all = df_train_with_group_for_all[columns]
        df_train_with_group_for_all["all_data"] = "All data"

        (df_margcor_sc1, df_pcor_sc1, df_pcor_sc1_no_length) = self.correlation_helper(
            df_train_with_group_for_all,
            "sc1",
            "all_data",
            include_length=include_length,
        )

        # get marginal and partial correlations against length for all data
        # if the length column is available
        df_margcor_length = pd.DataFrame()
        df_pcor_length = pd.DataFrame()

        if include_length:
            df_train_with_group_for_all = df_train_preprocess_length.copy()
            columns = selected_features + ["length"]

            df_train_with_group_for_all = df_train_with_group_for_all[columns]
            df_train_with_group_for_all["all_data"] = "All data"

            (df_margcor_length, df_pcor_length, _) = self.correlation_helper(
                df_train_with_group_for_all, "length", "all_data"
            )

        # get marginal and partial correlations against sc1 by group (preprocessed features)
        # also include partial correlations with length if length is available
        score_corr_by_group_dict = {}
        include_length = "length" in df_train_preprocess_length
        for grouping_variable in subgroups:
            corr_by_group = self.compute_correlations_by_group(
                df_train_preprocess_length,
                selected_features,
                "sc1",
                grouping_variable,
                include_length=include_length,
            )

            score_corr_by_group_dict[grouping_variable] = corr_by_group

        # get marginal and partial correlations against sc1 by group (preprocessed features)
        length_corr_by_group_dict = {}
        if include_length:
            for grouping_variable in subgroups:
                corr_by_group = self.compute_correlations_by_group(
                    df_train_preprocess_length,
                    selected_features,
                    "length",
                    grouping_variable,
                )

                length_corr_by_group_dict[grouping_variable] = corr_by_group

        # get PCA information
        df_pca_components, df_pca_variance = self.compute_pca(
            df_train_preprocessed, selected_features
        )

        # Datasets to add
        datasets = [
            DatasetDict({"name": "feature_descriptives", "frame": df_descriptives}),
            DatasetDict({"name": "feature_descriptivesExtra", "frame": df_percentiles}),
            DatasetDict({"name": "feature_outliers", "frame": df_outliers}),
            DatasetDict({"name": "cors_orig", "frame": df_pairwise_cors_orig}),
            DatasetDict({"name": "cors_processed", "frame": df_pairwise_cors_preprocess}),
            DatasetDict({"name": "margcor_score_all_data", "frame": df_margcor_sc1}),
            DatasetDict({"name": "pcor_score_all_data", "frame": df_pcor_sc1}),
            DatasetDict({"name": "pcor_score_no_length_all_data", "frame": df_pcor_sc1_no_length}),
            DatasetDict({"name": "margcor_length_all_data", "frame": df_margcor_length}),
            DatasetDict({"name": "pcor_length_all_data", "frame": df_pcor_length}),
            DatasetDict({"name": "pca", "frame": df_pca_components}),
            DatasetDict({"name": "pcavar", "frame": df_pca_variance}),
        ]

        # Add length correlation by group datasets
        for group in length_corr_by_group_dict:
            (length_marg_cors, length_part_cors, _) = length_corr_by_group_dict.get(
                group, (pd.DataFrame(), pd.DataFrame(), pd.DataFrame())
            )

            datasets.extend(
                [
                    DatasetDict({"name": f"margcor_length_by_{group}", "frame": length_marg_cors}),
                    DatasetDict({"name": f"pcor_length_by_{group}", "frame": length_part_cors}),
                ]
            )

        # Add score correlations by group datasets
        for group in score_corr_by_group_dict:
            (
                sc1_marg_cors,
                sc1_part_cors,
                sc1_part_cors_no_length,
            ) = score_corr_by_group_dict[group]

            datasets.extend(
                [
                    DatasetDict({"name": f"margcor_score_by_{group}", "frame": sc1_marg_cors}),
                    DatasetDict({"name": f"pcor_score_by_{group}", "frame": sc1_part_cors}),
                    DatasetDict(
                        {
                            "name": f"pcor_score_no_length_by_{group}",
                            "frame": sc1_part_cors_no_length,
                        }
                    ),
                ]
            )

        return configuration, DataContainer(datasets=datasets)



[docs]
    def run_prediction_analyses(
        self,
        data_container: DataContainer,
        configuration: Configuration,
        wandb_run: Union[Run, RunDisabled, None] = None,
    ) -> Tuple[Configuration, DataContainer]:
        """
        Run all analyses on the system scores (predictions).

        Parameters
        ----------
        data_container : DataContainer
            The DataContainer object which must include the following
            DataFrames: {"pred_test", "test_metadata", "test_human_scores"}.
            If "test_human_scores" is empty, all analyses that need a
            second human score are skipped.

        configuration : Configuration
            The Configuration object which must include the following
            parameters (keys):  {"subgroups", "second_human_score_column",
            "use_scaled_predictions"}.

        wandb_run : Union[wandb.wandb_run.Run, wandb.sdk.lib.RunDisabled, None]
            The wandb run object if wandb is enabled, ``None`` otherwise.
            It is passed to ``wandb.log_confusion_matrix`` for each of the
            confusion matrices computed by this method.
            Defaults to ``None``.

        Returns
        -------
        configuration : Configuration
            The input Configuration object that is passed through unmodified.

        data_container : DataContainer
            A new DataContainer object with the following DataFrames:

            - eval
            - eval_short
            - consistency
            - degradation
            - disattenuated_correlations
            - confMatrix
            - score_dist
            - true_score_eval (only if a second human score or a rater
              error variance is available)
            - confMatrix_h1h2 (only if a second human score is available)
            - eval_by_*
            - consistency_by_*
            - disattenuated_correlations_by_*

        Raises
        ------
        KeyError
            If a required DataFrame is missing from ``data_container``
            (raised by ``check_frame_names``).
        AssertionError
            If merging "pred_test" and "test_metadata" on "spkitemid" does
            not yield exactly as many rows as each of the two frames has.
        """
        frame_names = ["pred_test", "test_metadata", "test_human_scores"]

        param_names = [
            "subgroups",
            "second_human_score_column",
            "use_scaled_predictions",
        ]

        self.check_frame_names(data_container, frame_names)

        self.check_param_names(configuration, param_names)

        df_test = data_container["pred_test"].copy()
        df_test_metadata = data_container["test_metadata"].copy()
        df_test_human_scores = data_container["test_human_scores"].copy()

        subgroups = configuration["subgroups"]
        use_scaled_predictions = configuration["use_scaled_predictions"]

        # the rounded-score analyses below use either the scaled or the raw
        # trimmed-and-rounded predictions, depending on the configuration
        score_type = "scale" if use_scaled_predictions else "raw"
        rounded_system_column = f"{score_type}_trim_round"

        df_preds = pd.merge(df_test, df_test_metadata, on="spkitemid")

        # the (inner) merge must neither drop nor duplicate any rows
        assert len(df_preds.index) == len(df_test.index) == len(df_test_metadata.index)

        # set a general boolean flag indicating if
        # we should include the second human score
        include_second_score = not df_test_human_scores.empty

        # extract the columns that contain predictions
        prediction_columns = [column for column in df_test if column != "spkitemid"]

        # if a second score is available, use it
        if include_second_score:
            prediction_columns.append("sc2")
            df_preds_second_score = df_preds.merge(
                df_test_human_scores[["spkitemid", "sc2"]], on="spkitemid"
            )
        else:
            # NOTE: this is the same object as `df_preds`, not a copy, so the
            # "sc1_round" column added to `df_preds` further down is also
            # visible through `df_preds_second_score` in this branch.
            df_preds_second_score = df_preds

        # compute the evaluation metrics over the whole data set
        (df_human_system, df_human_system_short, df_human_human) = self.compute_metrics(
            df_preds_second_score[prediction_columns],
            compute_shortened=True,
            use_scaled_predictions=use_scaled_predictions,
            include_second_score=include_second_score,
        )

        # compute the evaluation metrics by group
        eval_by_group_dict = {}
        for group in subgroups:
            group_columns = prediction_columns + [group]
            eval_by_group_dict[group] = self.compute_metrics_by_group(
                df_preds_second_score[group_columns],
                group,
                use_scaled_predictions=use_scaled_predictions,
                include_second_score=include_second_score,
            )

        # compute the degradation statistics and disattenuated correlations
        # if we have the second human score available; otherwise both stay empty
        df_degradation = pd.DataFrame()
        df_correlations = pd.DataFrame()
        if include_second_score:
            (
                df_degradation,
                df_correlations,
            ) = self.compute_degradation_and_disattenuated_correlations(
                df_preds_second_score[prediction_columns]
            )

        # the following two evaluations require rounded human scores
        # we create a column for this
        df_preds["sc1_round"] = np.round(df_preds["sc1"])

        # compute the confusion matrix as a data frame; the rows and columns
        # are labeled with the sorted union of the observed human and system
        # scores, which is also how `confusion_matrix` orders them by default
        human_scores = df_preds["sc1_round"].astype("int64")
        system_scores = df_preds[rounded_system_column].astype("int64")
        conf_matrix = confusion_matrix(human_scores, system_scores)
        labels = sorted(pd.concat([human_scores, system_scores]).unique())
        df_confmatrix = pd.DataFrame(conf_matrix, index=labels, columns=labels)
        # log confusion matrix to W&B
        wandb.log_confusion_matrix(
            wandb_run,
            human_scores,
            system_scores,
            "Human-System Confusion Matrix",
            configuration.context,
        )

        # compute the score distributions (in percent of all test responses)
        # of the rounded human and system scores
        df_score_dist = df_preds[["sc1_round", rounded_system_column]].apply(
            lambda s: s.value_counts() / len(df_test) * 100
        )

        # Replace any NaNs, which we might get because our model never
        # predicts a particular score label, with zeros.
        df_score_dist = df_score_dist.fillna(0)

        df_score_dist.columns = ["human", f"sys_{score_type}"]
        df_score_dist["difference"] = df_score_dist[f"sys_{score_type}"] - df_score_dist["human"]
        df_score_dist["score"] = df_score_dist.index

        # reorder the columns and sort by score; plain reassignment is used
        # instead of in-place mutation of a frame derived by column selection
        df_score_dist = df_score_dist[
            ["score", "human", f"sys_{score_type}", "difference"]
        ].sort_values(by="score")

        # compute a human1-human2 confusion matrix, if possible; only the
        # responses that have both human scores can be used for this
        if include_second_score:
            df_double_scored = df_preds_second_score.dropna(subset=["sc1", "sc2"])
            human1_scores = df_double_scored["sc1"].round().astype("int64")
            human2_scores = df_double_scored["sc2"].round().astype("int64")
            conf_matrix_h1h2 = confusion_matrix(human1_scores, human2_scores)
            labels = sorted(pd.concat([human1_scores, human2_scores]).unique())
            df_confmatrix_h1h2 = pd.DataFrame(conf_matrix_h1h2, index=labels, columns=labels)
            wandb.log_confusion_matrix(
                wandb_run,
                human1_scores,
                human2_scores,
                "Human1-Human2 Confusion Matrix",
                configuration.context,
            )

        datasets: List[DatasetDict] = [
            {"name": "eval", "frame": df_human_system},
            {"name": "eval_short", "frame": df_human_system_short},
            {"name": "consistency", "frame": df_human_human},
            {"name": "degradation", "frame": df_degradation},
            {"name": "disattenuated_correlations", "frame": df_correlations},
            {"name": "confMatrix", "frame": df_confmatrix},
            {"name": "score_dist", "frame": df_score_dist},
        ]

        # compute true-score analyses if we have second score
        # or have been given rater error variance
        rater_error_variance = configuration.get_rater_error_variance()

        if include_second_score or rater_error_variance is not None:
            human_score_names = ("sc1", "sc2")
            system_score_columns = [
                col for col in prediction_columns if col not in human_score_names
            ]
            human_score_columns = [col for col in prediction_columns if col in human_score_names]

            df_prmse = get_true_score_evaluations(
                df_preds_second_score,
                system_score_columns,
                human_score_columns,
                rater_error_variance,
            )

            datasets.append({"name": "true_score_eval", "frame": df_prmse})

        if include_second_score:
            datasets.append({"name": "confMatrix_h1h2", "frame": df_confmatrix_h1h2})

        for group, (eval_by_group, consistency_by_group) in eval_by_group_dict.items():
            # compute disattenuated correlations if we have the second human
            # score; otherwise an empty frame is stored under the same name
            if include_second_score:
                dis_corr_by_group = self.compute_disattenuated_correlations(
                    eval_by_group[f"corr.{score_type}_trim"],
                    consistency_by_group["corr"],
                )
            else:
                dis_corr_by_group = pd.DataFrame()

            datasets.extend(
                [
                    {"name": f"eval_by_{group}", "frame": eval_by_group},
                    {"name": f"consistency_by_{group}", "frame": consistency_by_group},
                    {
                        "name": f"disattenuated_correlations_by_{group}",
                        "frame": dis_corr_by_group,
                    },
                ]
            )

        return configuration, DataContainer(datasets=datasets)



[docs]
    def run_data_composition_analyses_for_rsmtool(
        self, data_container: DataContainer, configuration: Configuration
    ) -> Tuple[Configuration, DataContainer]:
        """
        Run all data composition analyses for RSMTool.

        Summarizes the excluded responses of the training and evaluation
        sets, the responses that were actually used, and the number of
        responses per set for every subgroup.

        Parameters
        ----------
        data_container : DataContainer
            The DataContainer object which must include the following
            DataFrames: {"test_metadata", "train_metadata","train_excluded",
            "test_excluded", "train_features"}.
        configuration : Configuration
            The Configuration object which must include the
            following parameters (keys): {"subgroups", "candidate_column",
            "exclude_zero_scores", "exclude_listwise"}.

        Returns
        -------
        configuration : Configuration
            The input Configuration object that is passed through unmodified.

        data_container : DataContainer
            A new DataContainer object with the following DataFrames:

            - test_excluded_composition
            - train_excluded_composition
            - data_composition
            - data_composition_by_*
        """
        required_frames = [
            "train_metadata",
            "test_metadata",
            "train_excluded",
            "test_excluded",
            "train_features",
        ]
        required_params = [
            "candidate_column",
            "subgroups",
            "exclude_zero_scores",
            "exclude_listwise",
        ]
        self.check_frame_names(data_container, required_frames)
        self.check_param_names(configuration, required_params)

        # every column of the training features except the ID and the score
        non_feature_columns = ["spkitemid", "sc1"]
        feature_names = []
        for column_name in data_container["train_features"].columns:
            if column_name not in non_feature_columns:
                feature_names.append(column_name)

        # both excluded-response analyses share the same exclusion settings
        exclusion_settings = {
            "exclude_zero_scores": configuration["exclude_zero_scores"],
            "exclude_listwise": configuration["exclude_listwise"],
        }
        subgroups = configuration["subgroups"]
        candidate_column = configuration["candidate_column"]

        df_train_excluded = self.analyze_excluded_responses(
            data_container["train_excluded"], feature_names, "Score/Features", **exclusion_settings
        )
        df_test_excluded = self.analyze_excluded_responses(
            data_container["test_excluded"], feature_names, "Score/Features", **exclusion_settings
        )

        df_data_composition = self.analyze_used_responses(
            data_container["train_metadata"],
            data_container["test_metadata"],
            subgroups,
            candidate_column,
        )

        # for the by-subgroup analysis, tag each metadata frame with the set
        # it belongs to and combine the two into a single frame
        df_both_metadata = pd.merge(
            data_container["train_metadata"].assign(set="Training set"),
            data_container["test_metadata"].assign(set="Evaluation set"),
            how="outer",
        )

        # one contingency table (subgroup value x set) per subgroup, with the
        # subgroup values repeated as the first regular column
        composition_by_group = {}
        for subgroup in subgroups:
            df_counts = pd.crosstab(df_both_metadata[subgroup], df_both_metadata["set"])[
                ["Training set", "Evaluation set"]
            ]
            df_counts.insert(0, subgroup, df_counts.index)
            composition_by_group[subgroup] = df_counts

        datasets: List[DatasetDict] = [
            DatasetDict({"name": "test_excluded_composition", "frame": df_test_excluded}),
            DatasetDict({"name": "train_excluded_composition", "frame": df_train_excluded}),
            DatasetDict({"name": "data_composition", "frame": df_data_composition}),
        ]
        datasets.extend(
            DatasetDict({"name": f"data_composition_by_{group}", "frame": df_group})
            for group, df_group in composition_by_group.items()
        )

        return configuration, DataContainer(datasets=datasets)



[docs]
    def run_data_composition_analyses_for_rsmeval(
        self, data_container: DataContainer, configuration: Configuration
    ) -> Tuple[Configuration, DataContainer]:
        """
        Run all data composition analyses for RSMEval.

        Summarizes the excluded responses of the evaluation set, the
        predictions that were actually used, and the number of responses
        for every value of each subgroup.

        Parameters
        ----------
        data_container : DataContainer
            The DataContainer object which must include the following
            DataFrames: {"test_metadata", "test_excluded"}.

        configuration : Configuration
            The Configuration object which must include the
            following parameters (keys): {"subgroups", "candidate_column",
            "exclude_zero_scores", "exclude_listwise"}.

        Returns
        -------
        configuration : Configuration
            The input Configuration object that is passed through unmodified.

        data_container : DataContainer
            A new DataContainer object with the following DataFrames:

            - test_excluded_composition
            - data_composition
            - data_composition_by_*
        """
        self.check_frame_names(data_container, ["test_metadata", "test_excluded"])
        self.check_param_names(
            configuration,
            ["candidate_column", "subgroups", "exclude_zero_scores", "exclude_listwise"],
        )

        drop_zero_scores = configuration["exclude_zero_scores"]
        drop_listwise = configuration["exclude_listwise"]
        subgroups = configuration["subgroups"]
        candidate_column = configuration["candidate_column"]

        # summarize the excluded responses, treating the "raw" system score
        # column as the only "feature"
        df_test_excluded = self.analyze_excluded_responses(
            data_container["test_excluded"],
            ["raw"],
            "Human/System",
            exclude_zero_scores=drop_zero_scores,
            exclude_listwise=drop_listwise,
        )

        # the summary uses feature-oriented column labels; relabel them so
        # that they refer to system scores instead
        system_score_labels = {
            "all features numeric": "numeric system score",
            "non-numeric feature values": "non-numeric system score",
        }
        df_test_excluded.rename(columns=system_score_labels, inplace=True)

        df_data_composition = self.analyze_used_predictions(
            data_container["test_metadata"], subgroups, candidate_column
        )

        # one table per subgroup: the subgroup values as the first regular
        # column, followed by the number of responses for each value
        composition_by_group = {}
        for subgroup in subgroups:
            df_group_counts = pd.DataFrame(
                pd.pivot_table(
                    data_container["test_metadata"],
                    values="spkitemid",
                    index=[subgroup],
                    aggfunc=len,
                )
            )
            df_group_counts.insert(0, subgroup, df_group_counts.index)
            df_group_counts = df_group_counts.rename(columns={"spkitemid": "N responses"})
            composition_by_group[subgroup] = df_group_counts

        datasets: List[DatasetDict] = [
            DatasetDict({"name": "test_excluded_composition", "frame": df_test_excluded}),
            DatasetDict({"name": "data_composition", "frame": df_data_composition}),
        ]
        datasets.extend(
            DatasetDict({"name": f"data_composition_by_{group}", "frame": df_group})
            for group, df_group in composition_by_group.items()
        )

        return configuration, DataContainer(datasets=datasets)