"""
PRMSE utilities.

Utility classes and functions related to computing
test-theory-based evaluations.

The derivations and formulas were provided by Matt Johnson.

:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import warnings
from typing import List, Optional, Union

import numpy as np
import pandas as pd


def get_n_human_scores(human_scores: np.ndarray) -> np.ndarray:
    """
    Get the number of available human scores for each response.

    Parameters
    ----------
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).

    Returns
    -------
    n_scores : numpy.ndarray
        Total number of human scores of shape (n_samples, ). Only includes
        scores that are not NaN.
    """
    n_scores = (~np.isnan(human_scores)).sum(axis=1)
    return n_scores
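
# Example (illustrative, with toy data): NaN entries are treated as
# missing ratings, so a response scored by only one rater contributes
# a count of 1:
#
#     scores = np.array([[1.0, 1.0], [2.0, 3.0], [4.0, np.nan]])
#     get_n_human_scores(scores)   # array([2, 2, 1])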


def variance_of_errors(human_scores: np.ndarray) -> Optional[float]:
    """
    Estimate the variance of errors in human scores.

    Parameters
    ----------
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).

    Returns
    -------
    variance_of_errors : Optional[float]
        Estimated variance of errors in human scores. If the variance
        of errors cannot be estimated from the data, returns ``None``.
    """
    # we first compute the total number of scores
    # available for each response
    n_scores = get_n_human_scores(human_scores)

    # we will only be using responses with more
    # than one score
    multiple_mask = n_scores > 1

    # show a warning and return None
    # if we don't have valid human scores
    if multiple_mask.sum() == 0:
        warnings.warn(
            "True score evaluations cannot be computed because none of the "
            "responses in the evaluation set has valid system scores and 2 "
            "human scores."
        )
        return None
    else:
        # only select the responses with multiple scores
        multiple_scores = human_scores[multiple_mask]
        n_scores = n_scores[multiple_mask]

        # now let's compute the rater error variance for each response
        response_variances = np.nanvar(multiple_scores, ddof=1, axis=1)

        # finally, let's compute the variance of errors as a
        # weighted average of response variances
        variance_of_errors = np.average(response_variances, weights=n_scores - 1)

        return variance_of_errors
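
# Worked example (illustrative, with toy data): two responses below have
# rater variance 0.5 (ddof=1) and two have 0; every response has two
# ratings, so all weights are 1 and the weighted average is 0.25:
#
#     scores = np.array([[1.0, 1.0], [2.0, 3.0], [4.0, 4.0], [5.0, 4.0]])
#     variance_of_errors(scores)   # 0.25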

def true_score_variance(
    human_scores: np.ndarray, variance_errors_human: Optional[float] = None
) -> Optional[float]:
    """
    Compute variance of true scores for multiple raters.

    Parameters
    ----------
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).
    variance_errors_human : Optional[float]
        Estimated variance of errors in human scores. If ``None``,
        the variance will be estimated from the data. In this case
        at least some responses *must* have more than one human score.
        Defaults to ``None``.

    Returns
    -------
    variance_true_scores : Optional[float]
        Variance of true scores. If the variance of errors in human
        scores is not available and cannot be estimated from the data,
        returns ``None``.
    """
    # if we don't have variance of errors, compute it
    # from the data
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(human_scores)

    # if it's still None, return None
    if variance_errors_human is None:
        return None
    else:
        # compute mean human score and total number of scores
        # for each response
        mean_scores = np.nanmean(human_scores, axis=1)
        n_scores = get_n_human_scores(human_scores)

        # compute overall mean
        mean_human_score = np.nanmean(human_scores)

        # let N be the total number of responses
        N = len(human_scores)

        # let M be the total number of human ratings
        M = n_scores.sum()

        # compute squared deviations
        squared_devs = (mean_scores - mean_human_score) ** 2

        # adjust them by the number of human scores available
        # for each response: deviations with a higher number of
        # human scores are assigned a greater weight
        adjusted_squared_devs = n_scores * squared_devs

        # compute the sum of squares
        sum_of_squares = adjusted_squared_devs.sum()

        # now compute the numerator as the sum of squares
        # adjusted for the variance of human errors
        numerator = sum_of_squares - (N - 1) * variance_errors_human

        # compute the denominator as the adjusted total number of scores
        denominator = M - ((n_scores**2).sum() / M)

        # finally compute the variance of true scores
        variance_true_scores = numerator / denominator

        return variance_true_scores


def mse_true(
    system: np.ndarray, human_scores: np.ndarray, variance_errors_human: Optional[float] = None
) -> Optional[float]:
    """
    Compute mean squared error (MSE) when predicting true score from system score.

    Parameters
    ----------
    system : numpy.ndarray
        System scores for each response of shape (n_samples,).
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).
    variance_errors_human : Optional[float]
        Estimated variance of errors in human scores. If ``None``,
        the variance will be estimated from the data. In this case
        at least some responses *must* have more than one human score.
        Defaults to ``None``.

    Returns
    -------
    mse : Optional[float]
        Mean squared error when predicting the true score from the
        system score. If the variance of errors in human scores is not
        available and cannot be estimated from the data, returns ``None``.
    """
    # if we don't have variance of errors, compute it
    # from the data
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(human_scores)

    # if it's still None, return None
    if variance_errors_human is None:
        return None
    else:
        # get the total number of scores for each response
        n_scores = get_n_human_scores(human_scores)
        mean_scores = np.nanmean(human_scores, axis=1)

        N = len(system)

        se = ((mean_scores - system) ** 2) * n_scores

        # compute the mean squared error when predicting the true score
        mse = (se.sum() - N * variance_errors_human) / n_scores.sum()

        return mse
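
# Worked example (illustrative): reusing the toy ratings from above with
# error variance 0.25, the true score variance works out to
# (15 - 3 * 0.25) / (8 - 16 / 8) = 2.375, and for a hypothetical set of
# system scores the MSE against true scores is (2.0 - 4 * 0.25) / 8 = 0.125:
#
#     scores = np.array([[1.0, 1.0], [2.0, 3.0], [4.0, 4.0], [5.0, 4.0]])
#     system = np.array([1.5, 2.0, 3.5, 4.0])
#     true_score_variance(scores, 0.25)   # 2.375
#     mse_true(system, scores, 0.25)      # 0.125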

def prmse_true(
    system: np.ndarray, human_scores: np.ndarray, variance_errors_human: Optional[float] = None
) -> Optional[float]:
    """
    Compute PRMSE when predicting true score from system scores.

    PRMSE = Proportional Reduction in Mean Squared Error.
    The formula to compute PRMSE implemented in RSMTool
    was derived at ETS by Matthew S. Johnson. See
    `Loukina et al. (2020) <https://aclanthology.org/2020.bea-1.2.pdf>`_
    for further information about PRMSE.

    Parameters
    ----------
    system : numpy.ndarray
        System scores for each response of shape (n_samples,).
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).
    variance_errors_human : Optional[float]
        Estimated variance of errors in human scores. If ``None``,
        the variance will be estimated from the data. In this case
        at least some responses *must* have more than one human score.
        Defaults to ``None``.

    Returns
    -------
    prmse : Optional[float]
        Proportional reduction in mean squared error.
        If the variance of errors in human scores is not available
        and cannot be estimated from the data, returns ``None``.

    Raises
    ------
    ValueError
        If variance of true scores or MSE could not be computed.
    """
    # check that human_scores is a two-dimensional array
    # and reshape if necessary
    if len(human_scores.shape) == 1:
        current_length = human_scores.shape[0]
        # first assume we have a pandas series
        try:
            human_scores = human_scores.values.reshape(current_length, 1)
        # if not, treat this as an array
        except AttributeError:
            human_scores = human_scores.reshape(current_length, 1)

    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(human_scores)

    # if it's still None, return None
    if variance_errors_human is None:
        return None
    else:
        variance_true = true_score_variance(human_scores, variance_errors_human)
        mse = mse_true(system, human_scores, variance_errors_human)

        if variance_true is None or mse is None:
            raise ValueError("Variance of true scores or MSE could not be computed.")

        prmse = 1 - (mse / variance_true)
        return prmse
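
# Example (illustrative): with the same toy data, the variance of human
# errors is estimated internally (0.25), so PRMSE evaluates to
# 1 - 0.125 / 2.375, i.e. about 0.947:
#
#     scores = np.array([[1.0, 1.0], [2.0, 3.0], [4.0, 4.0], [5.0, 4.0]])
#     system = np.array([1.5, 2.0, 3.5, 4.0])
#     prmse_true(system, scores)   # 0.9473...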

def get_true_score_evaluations(
    df: pd.DataFrame,
    system_score_columns: Union[str, List[str]],
    human_score_columns: Union[str, List[str]],
    variance_errors_human: Optional[float] = None,
) -> pd.DataFrame:
    """
    Generate true score evaluations for HTML reports.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data frame. Must contain columns listed in
        ``system_score_columns`` and ``human_score_columns``.
    system_score_columns : Union[str, List[str]]
        System score column name or list of columns containing system scores.
    human_score_columns : Union[str, List[str]]
        Human score column or list of columns containing human scores.
        True score evaluations require estimating the variance of human
        errors, which can only be computed when a subset of responses
        has two or more human ratings. If ``human_score_columns`` is a
        single column name, ``variance_errors_human`` *must* also be
        specified.
    variance_errors_human : Optional[float]
        Estimated variance of errors in human scores. If ``None``, the
        variance will be estimated from the data, in which case some
        responses *must* have more than one human rating.
        Defaults to ``None``.

    Returns
    -------
    prmse_metrics : pandas.DataFrame
        DataFrame containing different evaluation metrics related to the
        evaluation of system scores against true scores. The column
        names are:

        - ``"N"``: total number of responses
        - ``"N raters"``: maximum number of ratings available for a single response
        - ``"N single"``: total number of responses with a single human score
        - ``"N multiple"``: total number of responses with more than one human score
        - ``"Variance of errors"``: estimated variance of human errors
        - ``"True score var"``: estimated true score variance
        - ``"MSE true"``: mean squared error when predicting true score from machine score
        - ``"PRMSE true"``: proportional reduction in mean squared error when predicting true score

    Raises
    ------
    ValueError
        If ``human_score_columns`` is a single column name and
        ``variance_errors_human`` is not specified.
    """
    # check that if we only have one human column, we were also given
    # the variance of errors
    if isinstance(human_score_columns, str):
        if variance_errors_human is None:
            raise ValueError(
                "True score evaluations require estimating "
                "the variance of human errors, "
                "which can only be computed when a subset "
                "of responses has two or more human ratings. "
                "If a single human_score_column "
                "is supplied, one must also specify variance_errors_human."
            )
        # normalize a single column name to a list so that the human
        # scores are always selected as a two-dimensional frame
        human_score_columns = [human_score_columns]

    if isinstance(system_score_columns, str):
        system_score_columns = [system_score_columns]

    # compute the variance of errors if it wasn't specified
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(df[human_score_columns])

    # compute the PRMSE for each system score column
    prmse_all = []
    for system in system_score_columns:
        mse = mse_true(df[system], df[human_score_columns], variance_errors_human)
        prmse = prmse_true(df[system], df[human_score_columns], variance_errors_human)
        prmse_metrics = pd.Series({"MSE true": mse, "PRMSE true": prmse}, name=system)
        prmse_all.append(prmse_metrics)

    df_prmse = pd.concat(prmse_all, axis=1, sort=True).transpose()

    score_counts = get_n_human_scores(df[human_score_columns])

    # compute values that are the same for all scores
    df_prmse.insert(0, "N", len(df))
    df_prmse.insert(1, "N raters", score_counts.max())
    df_prmse.insert(2, "N single", (score_counts == 1).sum())
    df_prmse.insert(3, "N multiple", (score_counts > 1).sum())
    df_prmse.insert(4, "Variance of errors", variance_errors_human)
    df_prmse.insert(
        5,
        "True score var",
        true_score_variance(df[human_score_columns], variance_errors_human),
    )

    return df_prmse
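
# Example usage (illustrative; the column names are hypothetical): a
# minimal frame with one system score column and two human rating columns.
# With the toy values below, the variance of errors is estimated from the
# data (0.25), and the returned frame has a single row indexed "sys":
#
#     df = pd.DataFrame(
#         {
#             "sys": [1.5, 2.0, 3.5, 4.0],
#             "h1": [1.0, 2.0, 4.0, 5.0],
#             "h2": [1.0, 3.0, 4.0, 4.0],
#         }
#     )
#     get_true_score_evaluations(df, "sys", ["h1", "h2"])
#     # N=4, N raters=2, N single=0, N multiple=4,
#     # Variance of errors=0.25, True score var=2.375,
#     # MSE true=0.125, PRMSE true=0.9473...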