% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/evaluate_new_ratings.R
\name{evaluate_external_ratings}
\alias{evaluate_external_ratings}
\title{Evaluate how new typicality ratings predict human ratings and compare performance to LLM baselines}
\usage{
evaluate_external_ratings(
  df,
  method = "pearson",
  baselines = c("mean_gpt4_rating", "mean_llama3_rating"),
  verbose = TRUE
)
}
\arguments{
\item{df}{A data frame with columns \code{adjective}, \code{group}, and \code{rating}. Must contain
typicality scores for all 100 validation items used in the original study.}

\item{method}{The correlation method to use in \code{\link[stats:cor.test]{stats::cor.test()}}. Must be one of:
\code{"pearson"} (default), \code{"spearman"}, or \code{"kendall"}.}

\item{baselines}{Character vector of column names in the validation set to compare against
(default: \code{c("mean_gpt4_rating", "mean_llama3_rating")}).}

\item{verbose}{Logical. If \code{TRUE} (default), prints a summary of the correlations
and baseline comparisons. Set to \code{FALSE} to suppress console output.}
}
\value{
A tibble (invisibly) with one row per model (\code{external} and each baseline),
and columns \code{model}, \code{r}, and \code{p} for the correlation coefficient and p-value.
}
\description{
This function compares external typicality ratings (e.g., generated by a new LLM)
against the validation dataset included in 'baserater'. The validation set contains
average typicality ratings collected from 50 Prolific participants on a subset of
100 group–adjective pairs, as described in the accompanying paper.

The input ratings are merged with this reference set, and the function then:
\enumerate{
\item Computes a correlation (\code{cor.test}) between the external ratings and the human average;
\item Compares it to one or more built-in model baselines (default: 'GPT-4' and 'LLaMA 3.3');
\item Prints a clear summary of all correlation coefficients and flags whether the external model outperforms each baseline (only when \code{verbose = TRUE});
\item Returns a tidy result invisibly.
}
}
\examples{
\dontrun{
new_scores <- tibble::tibble(
  group = ratings$group,
  adjective = ratings$adjective,
  rating = runif(100)  # Replace with model predictions
)
evaluate_external_ratings(new_scores)
}
}
