"""
PRMSE utilities.
Utility classes and functions related to computing test
theory based evaluations.
The derivations and formulas were provided by Matt Johnson.
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""
import warnings
from typing import List, Optional, Union
import numpy as np
import pandas as pd
def get_n_human_scores(human_scores: np.ndarray) -> np.ndarray:
    """
    Count how many human ratings are present for every response.

    A rating is considered present when it is not NaN.

    Parameters
    ----------
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).

    Returns
    -------
    n_scores : numpy.ndarray
        Number of non-NaN human scores per response, of shape (n_samples, ).
    """
    # flag the missing ratings first, then invert the flags so that
    # summing across the rater axis yields the per-response count
    is_missing = np.isnan(human_scores)
    is_present = ~is_missing
    return is_present.sum(axis=1)
[docs]
def variance_of_errors(human_scores: np.ndarray) -> Optional[float]:
    """
    Estimate the variance of rater errors from multiply-scored responses.

    Only responses with more than one non-NaN rating contribute. For each
    of them the sample variance (``ddof=1``) of its ratings is computed, and
    the result is the average of those variances weighted by the number of
    ratings minus one.

    Parameters
    ----------
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).

    Returns
    -------
    variance_of_errors : Optional[float]
        Estimated variance of errors in human scores. ``None`` (together
        with a warning) when no response has more than one human score.
    """
    # number of non-missing ratings attached to every response
    ratings_per_response = get_n_human_scores(human_scores)

    # a within-response variance only exists with two or more ratings
    has_multiple = ratings_per_response > 1

    # nothing to estimate from: tell the caller and bail out
    if not has_multiple.any():
        warnings.warn(
            "True score evaluations cannot be computed because none of the "
            "responses in the evaluation set has valid system scores and 2 "
            "human scores."
        )
        return None

    # restrict both the ratings and their counts to the usable responses
    multiply_scored = human_scores[has_multiple]
    degrees_of_freedom = ratings_per_response[has_multiple] - 1

    # sample variance of the ratings within each response, ignoring NaNs
    per_response_variance = np.nanvar(multiply_scored, ddof=1, axis=1)

    # pool the per-response variances; responses with more ratings
    # carry proportionally more weight
    return np.average(per_response_variance, weights=degrees_of_freedom)
def true_score_variance(
    human_scores: np.ndarray, variance_errors_human: Optional[float] = None
) -> Optional[float]:
    """
    Estimate the variance of true scores from (possibly multiple) human ratings.

    Parameters
    ----------
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).
    variance_errors_human : Optional[float]
        Estimated variance of errors in human scores. If ``None``, it is
        estimated from ``human_scores`` via ``variance_of_errors``, which
        requires at least some responses with more than one human score.
        Defaults to ``None``.

    Returns
    -------
    variance_true_scores : Optional[float]
        Variance of true scores, or ``None`` when the variance of human
        errors was not supplied and could not be estimated from the data.
    """
    # fall back to estimating the error variance from the ratings;
    # without it there is nothing we can compute
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(human_scores)
        if variance_errors_human is None:
            return None

    # per-response mean rating and number of available ratings
    response_means = np.nanmean(human_scores, axis=1)
    ratings_per_response = get_n_human_scores(human_scores)

    # mean over every available rating
    grand_mean = np.nanmean(human_scores)

    # number of responses and total number of ratings
    n_responses = len(human_scores)
    n_ratings = ratings_per_response.sum()

    # squared deviation of each response mean from the grand mean,
    # weighted by how many ratings back that response mean
    weighted_sq_devs = ratings_per_response * (response_means - grand_mean) ** 2

    # remove the share of the sum of squares attributable to rater error
    corrected_sum_of_squares = (
        weighted_sq_devs.sum() - (n_responses - 1) * variance_errors_human
    )

    # total number of ratings adjusted for how unevenly they are
    # distributed across responses
    adjusted_n_ratings = n_ratings - ((ratings_per_response**2).sum() / n_ratings)

    return corrected_sum_of_squares / adjusted_n_ratings
def mse_true(
    system: np.ndarray, human_scores: np.ndarray, variance_errors_human: Optional[float] = None
) -> Optional[float]:
    """
    Compute the mean squared error (MSE) of system scores against true scores.

    Parameters
    ----------
    system : numpy.ndarray
        System scores for each response of shape (n_samples,).
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).
    variance_errors_human : Optional[float]
        Estimated variance of errors in human scores. If ``None``, it is
        estimated from ``human_scores`` via ``variance_of_errors``, which
        requires at least some responses with more than one human score.
        Defaults to ``None``.

    Returns
    -------
    mse : Optional[float]
        Mean squared error when predicting the true score from the system
        score, or ``None`` when the variance of human errors was not
        supplied and could not be estimated from the data.
    """
    # fall back to estimating the error variance from the ratings;
    # without it there is nothing we can compute
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(human_scores)
        if variance_errors_human is None:
            return None

    # number of available ratings and mean rating for each response
    ratings_per_response = get_n_human_scores(human_scores)
    response_means = np.nanmean(human_scores, axis=1)
    n_responses = len(system)

    # squared gap between mean human score and system score, weighted
    # by the number of ratings behind each mean
    weighted_sq_errors = ((response_means - system) ** 2) * ratings_per_response

    # subtract the rater-error contribution, then average over all ratings
    error_adjustment = n_responses * variance_errors_human
    return (weighted_sq_errors.sum() - error_adjustment) / ratings_per_response.sum()
[docs]
def prmse_true(
    system: np.ndarray, human_scores: np.ndarray, variance_errors_human: Optional[float] = None
) -> Optional[float]:
    """
    Compute PRMSE when predicting true score from system scores.

    PRMSE = Proportional Reduction in Mean Squared Error. It is computed
    here as ``1 - mse_true / true_score_variance``.

    The formula to compute PRMSE implemented in RSMTool
    was derived at ETS by Matthew S. Johnson. See
    `Loukina et al. (2020) <https://aclanthology.org/2020.bea-1.2.pdf>`_
    for further information about PRMSE.

    Parameters
    ----------
    system : numpy.ndarray
        System scores for each response of shape (n_samples,).
    human_scores : numpy.ndarray
        Human ratings for each response of shape (n_samples, n_ratings).
        A one-dimensional input is treated as a single rating per response
        and reshaped to (n_samples, 1).
    variance_errors_human : Optional[float]
        Estimated variance of errors in human scores. If ``None``, it is
        estimated from ``human_scores`` via ``variance_of_errors``, which
        requires at least some responses with more than one human score.
        Defaults to ``None``.

    Returns
    -------
    prmse : Optional[float]
        Proportional reduction in mean squared error, or ``None`` when the
        variance of human errors was not supplied and could not be
        estimated from the data.

    Raises
    ------
    ValueError
        If variance of true scores or MSE could not be computed.
    """
    # the helpers below need a two-dimensional array, so turn a flat
    # vector of ratings into a single-column matrix
    if len(human_scores.shape) == 1:
        n_responses = human_scores.shape[0]
        # a pandas Series keeps its data under ``.values``;
        # a plain array is reshaped directly
        flat_ratings = getattr(human_scores, "values", human_scores)
        human_scores = flat_ratings.reshape(n_responses, 1)

    # fall back to estimating the error variance from the ratings;
    # without it there is nothing we can compute
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(human_scores)
        if variance_errors_human is None:
            return None

    variance_true = true_score_variance(human_scores, variance_errors_human)
    mse = mse_true(system, human_scores, variance_errors_human)

    if variance_true is None or mse is None:
        raise ValueError("Variance of true scores or MSE could not be computed. ")

    return 1 - (mse / variance_true)
def get_true_score_evaluations(
    df: pd.DataFrame,
    system_score_columns: Union[str, List[str]],
    human_score_columns: Union[str, List[str]],
    variance_errors_human: Optional[float] = None,
) -> pd.DataFrame:
    """
    Generate true score evaluations for HTML reports.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data frame. Must contain columns listed in ``system_score_columns``
        and ``human_score_columns``.
    system_score_columns : Union[str, List[str]]
        System score column name or list of columns containing system scores.
    human_score_columns : Union[str, List[str]]
        Human score column or list of columns containing human scores.
        True score evaluations require estimating variance of human errors,
        which can only be computed when a subset of responses has two or more
        human ratings. If ``human_score_columns`` is a single column name,
        ``variance_errors_human`` *must* also be specified.
    variance_errors_human : Optional[float]
        Estimated variance of errors in human scores. If ``None``, the variance
        will be estimated from the data in which case some responses *must* have
        more than one human rating.
        Defaults to ``None``.

    Returns
    -------
    prmse_metrics : pandas.DataFrame
        DataFrame with one row per system score column, containing different
        evaluation metrics related to the evaluation of system scores against
        true scores. The column names are, in order:

        - ``"N"``: total number of responses
        - ``"N raters"``: maximum number of ratings available for a single response
        - ``"N single"``: total number of responses with a single human score
        - ``"N multiple"``: total number of responses with more than one
          human score
        - ``"Variance of errors"``: estimated variance of human errors
        - ``"True score var"``: estimated true score variance
        - ``"MSE true"``: mean squared error when predicting true score from
          machine score
        - ``"PRMSE true"``: proportional reduction in mean squared error when
          predicting true score

    Raises
    ------
    ValueError
        If ``human_score_columns`` is a single column name and ``variance_errors_human``
        is not specified.
    """
    # check that if we only have one human column, we were also given
    # variance of errors
    if isinstance(human_score_columns, str):
        if variance_errors_human is None:
            raise ValueError(
                "True score evaluations require estimating "
                "variance of human errors, "
                "which can only be computed when a subset "
                "of responses has two or more human ratings. "
                "If a single human_score_column "
                "is supplied, one must also specify variance_errors_human"
            )
        # wrap the single name in a list so that selecting it from ``df``
        # yields a two-dimensional frame; a bare string would select a
        # one-dimensional Series, which the per-response helpers (they
        # reduce along ``axis=1``) cannot handle
        human_score_columns = [human_score_columns]

    if isinstance(system_score_columns, str):
        system_score_columns = [system_score_columns]

    # all helpers work off the same (n_samples, n_ratings) selection
    df_human = df[human_score_columns]

    # compute variance of errors if it wasn't specified
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(df_human)

    # compute MSE and PRMSE separately for every system score column
    prmse_all = []
    for system in system_score_columns:
        mse = mse_true(df[system], df_human, variance_errors_human)
        prmse = prmse_true(df[system], df_human, variance_errors_human)
        prmse_metrics = pd.Series({"MSE true": mse, "PRMSE true": prmse}, name=system)
        prmse_all.append(prmse_metrics)

    # one row per system score column
    df_prmse = pd.concat(prmse_all, axis=1, sort=True).transpose()

    score_counts = get_n_human_scores(df_human)

    # prepend the values that are the same for all system scores
    df_prmse.insert(0, "N", len(df))
    df_prmse.insert(1, "N raters", score_counts.max())
    df_prmse.insert(2, "N single", (score_counts == 1).sum())
    df_prmse.insert(3, "N multiple", (score_counts > 1).sum())
    df_prmse.insert(4, "Variance of errors", variance_errors_human)
    df_prmse.insert(
        5,
        "True score var",
        true_score_variance(df_human, variance_errors_human),
    )
    return df_prmse