Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/source/metric-list.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ These metrics need the model to generate an output. They are therefore slower.
- normalization on string pre-comparison on whitespace, articles, capitalization, ....
- comparing the full string, or only subsets (prefix, suffix, ...)
- `maj_at_k`: Model majority vote. Samples k generations from the model and assumes the most frequent is the actual prediction.
- `bayes_at_n`: Corpus-level Bayes@N for repeated generations. It reports `bayes@n`, the posterior mean, and `bayes@n_sigma`, the posterior standard deviation; multi-category outcomes require category weights.
- `f1_score`: Average F1 score in terms of word overlap between the model output and gold (normalisation optional).
- `f1_score_macro`: Corpus level macro F1 score.
- `f1_score_micro`: Corpus level micro F1 score.
Expand Down Expand Up @@ -54,7 +55,7 @@ These metrics need the model to generate an output. They are therefore slower.
- `edit_distance`: Average Levenshtein edit distance between model generation and reference,
- `edit_similarity`: Average Levenshtein edit similarity (normalized by the length of longer sequence) between model generation and reference.
- Math:
- Both `exact_match` and `maj_at_k` can be used to evaluate mathematics tasks with math specific normalization to remove and filter latex.
- `exact_match`, `maj_at_k`, and `bayes_at_n` can be used to evaluate mathematics tasks with math specific normalization to remove and filter latex.

## LLM-as-Judge
- `llm_judge_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API.
Expand Down
160 changes: 160 additions & 0 deletions src/lighteval/metrics/bayes_at_n.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

"""Bayes@N posterior moments for repeated categorical outcomes."""

from collections.abc import Sequence

import numpy as np


def _as_2d_int_matrix(values: Sequence[Sequence[int]] | np.ndarray, name: str) -> np.ndarray:
try:
matrix = np.asarray(values)
except ValueError as exc:
raise ValueError(f"{name} must be a rectangular 1D or 2D array.") from exc

if matrix.ndim == 1:
matrix = matrix.reshape(1, -1)
elif matrix.ndim != 2:
raise ValueError(f"{name} must be a 1D or 2D array.")

if matrix.shape[0] == 0:
raise ValueError(f"{name} must contain at least one row.")

if matrix.dtype == np.dtype("bool"):
return matrix.astype(int)
if np.issubdtype(matrix.dtype, np.integer):
return matrix.astype(int, copy=False)
if not np.issubdtype(matrix.dtype, np.number):
raise ValueError(f"{name} entries must be integer category ids.")

float_matrix = matrix.astype(float)
if not np.all(np.isfinite(float_matrix)):
raise ValueError(f"{name} entries must be finite integer category ids.")
if not np.all(float_matrix == np.floor(float_matrix)):
raise ValueError(f"{name} entries must be integer category ids.")
return float_matrix.astype(int)


def _as_weights(weights: Sequence[float] | np.ndarray | None, R: np.ndarray) -> np.ndarray:
if weights is None:
unique_values = np.unique(R)
if np.all(np.isin(unique_values, [0, 1])):
return np.array([0.0, 1.0])

unique_str = ", ".join(str(value) for value in unique_values)
raise ValueError(
f"R contains non-binary category ids ({unique_str}); pass weights to score multi-category outcomes."
)

weight_array = np.asarray(weights, dtype=float)
if weight_array.ndim != 1:
raise ValueError("weights must be a 1D array.")
if weight_array.size == 0:
raise ValueError("weights must contain at least one value.")
if not np.all(np.isfinite(weight_array)):
raise ValueError("weights must contain only finite values.")
return weight_array


def _validate_matrix_range(matrix: np.ndarray, low: int, high: int, name: str) -> None:
    """Raise ``ValueError`` unless every entry of *matrix* lies in ``[low, high]``."""
    if matrix.size == 0:
        # Nothing to validate (e.g. an empty prior matrix).
        return
    if not (low <= matrix.min() and matrix.max() <= high):
        raise ValueError(f"{name} entries must be integers in [{low}, {high}].")


def _row_bincount(matrix: np.ndarray, length: int) -> np.ndarray:
    """Count, per row, how many times each id in ``[0, length)`` occurs."""
    counts = np.zeros((matrix.shape[0], length), dtype=int)
    if matrix.shape[1] == 0:
        # No observations at all: every row has all-zero counts.
        return counts
    for row_index, row in enumerate(matrix):
        # Unbuffered add so repeated ids within a row accumulate correctly.
        np.add.at(counts[row_index], row, 1)
    return counts


def _as_prior_matrix(
prior: Sequence[Sequence[int]] | np.ndarray | None,
num_rows: int,
) -> np.ndarray:
if prior is None:
return np.zeros((num_rows, 0), dtype=int)

prior_matrix = _as_2d_int_matrix(prior, "prior")
if prior_matrix.ndim == 1:
prior_matrix = prior_matrix.reshape(1, -1)
if prior_matrix.shape[0] != num_rows:
if prior_matrix.size % num_rows != 0:
raise ValueError("prior must have the same number of rows as R.")
prior_matrix = prior_matrix.reshape(num_rows, -1)
return prior_matrix


def bayes_at_n(
    R: Sequence[Sequence[int]] | np.ndarray,
    weights: Sequence[float] | np.ndarray | None = None,
    prior: Sequence[Sequence[int]] | np.ndarray | None = None,
) -> tuple[float, float]:
    """Return the Bayes@N posterior mean and standard deviation.

    Args:
        R: ``M x N`` matrix of integer category ids. A 1D array is treated as
            one row.
        weights: Category score weights. If omitted, ``R`` must be binary and
            weights ``[0.0, 1.0]`` are used.
        prior: Optional ``M x D`` matrix of row-aligned prior observations.

    Returns:
        ``(mu, sigma)``, where ``mu`` is the posterior mean and ``sigma`` is
        the posterior standard deviation.
    """
    outcomes = _as_2d_int_matrix(R, "R")
    if outcomes.shape[1] == 0:
        raise ValueError("R must contain at least one outcome per row.")

    weight_vector = _as_weights(weights, outcomes)
    num_rows, num_samples = outcomes.shape
    max_category = weight_vector.size - 1
    prior_matrix = _as_prior_matrix(prior, num_rows)

    _validate_matrix_range(outcomes, 0, max_category, "R")
    _validate_matrix_range(prior_matrix, 0, max_category, "prior")

    num_prior = prior_matrix.shape[1]
    # Per-row pseudo-count total: one uniform Dirichlet count per category
    # (max_category + 1 of them) plus the prior and observed sample counts.
    total_count = 1 + max_category + num_prior + num_samples

    observed_counts = _row_bincount(outcomes, max_category + 1)
    # +1 adds the uniform Dirichlet(1) pseudo-count to each category.
    posterior_counts = observed_counts + _row_bincount(prior_matrix, max_category + 1) + 1

    # Shift weights so category 0 contributes zero; add w[0] back at the end.
    delta_weights = weight_vector - weight_vector[0]
    mu = weight_vector[0] + (posterior_counts @ delta_weights).sum() / (num_rows * total_count)

    posterior_probs = posterior_counts / total_count
    second_moment = (posterior_probs * (delta_weights**2)).sum(axis=1)
    squared_mean = (posterior_probs @ delta_weights) ** 2
    # Per-row posterior variance of the mean shrinks by (total_count + 1);
    # averaging M independent rows divides the variance by M^2.
    sigma = np.sqrt((second_moment - squared_mean).sum() / (num_rows**2 * (total_count + 1)))

    return float(mu), float(sigma)


__all__ = ["bayes_at_n"]
39 changes: 39 additions & 0 deletions src/lighteval/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from lighteval.metrics.harness_compatibility.drop import DropMetrics
from lighteval.metrics.harness_compatibility.truthful_qa import TruthfulqaMCMetrics
from lighteval.metrics.metrics_corpus import (
BayesAtNCorpus,
CorpusLevelF1Score,
CorpusLevelPerplexityMetric,
CorpusLevelTranslationMetric,
Expand All @@ -44,6 +45,7 @@
ROUGE,
AccGoldLikelihood,
AvgAtN,
BayesAtN,
BertScore,
ExactMatches,
Extractiveness,
Expand Down Expand Up @@ -172,6 +174,43 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
# Bayes@N over repeated generations: "bayes@n" is the corpus-level posterior
# mean and "bayes@n_sigma" the posterior standard deviation (an uncertainty
# estimate, hence lower is better).
bayes_at_n = CorpusLevelMetricGrouping(
metric_name=["bayes@n", "bayes@n_sigma"],
sample_level_fn=BayesAtN(strip_strings=True),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={
"bayes@n": BayesAtNCorpus("mu"),
"bayes@n_sigma": BayesAtNCorpus("sigma"),
},
higher_is_better={
"bayes@n": True,
"bayes@n_sigma": False,
},
)
# Math-specific Bayes@N variant: per-sample correctness is judged by
# extractive expression/LaTeX matching (boxed answers prioritized) before the
# corpus-level posterior mean/std-dev aggregation.
bayes_at_n_math = CorpusLevelMetricGrouping(
metric_name=["math-bayes@n", "math-bayes@n_sigma"],
sample_level_fn=BayesAtN(
name_prefix="math",
strip_strings=True,
sample_scoring_function=MultilingualExtractiveMatchMetric(
language=Language.ENGLISH,
fallback_mode="first_match",
precision=5,
gold_extraction_target=(ExprExtractionConfig(),),
pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
aggregation_function=max,
),
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={
"math-bayes@n": BayesAtNCorpus("mu"),
"math-bayes@n_sigma": BayesAtNCorpus("sigma"),
},
higher_is_better={
"math-bayes@n": True,
"math-bayes@n_sigma": False,
},
)
bert_score = SampleLevelMetricGrouping(
metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"],
sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip),
Expand Down
91 changes: 91 additions & 0 deletions src/lighteval/metrics/metrics_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import sacrebleu
import sklearn.metrics

from lighteval.metrics.bayes_at_n import bayes_at_n
from lighteval.metrics.sample_preparator import (
GenerativeCorpusMetricInput,
LogprobCorpusMetricInput,
Expand Down Expand Up @@ -62,6 +63,96 @@ def __str__(self):
return f"{self.__class__.__name__}({', '.join(attr_strs)})"


def _is_repeated_full_bayes_prior(non_null_priors: list[object], first_prior: np.ndarray, num_rows: int) -> bool:
    """Return True when every row carries the same full-corpus prior payload."""
    for candidate in non_null_priors:
        if not np.array_equal(np.asarray(candidate), first_prior):
            return False
    # A shared payload counts as a full prior only if its shape matches the corpus.
    if first_prior.ndim == 2:
        return first_prior.shape[0] == num_rows
    return first_prior.ndim == 1 and num_rows == 1


def _coerce_bayes_prior_row(prior: object) -> list[int]:
    """Flatten one per-row prior payload into a plain list of category ids."""
    arr = np.asarray(prior)
    if arr.ndim == 0:
        raise ValueError("Bayes@N prior rows must be 1D arrays.")
    if arr.ndim == 1:
        return arr.tolist()
    if arr.ndim != 2:
        raise ValueError("Bayes@N row-level prior payloads must be 1D arrays.")
    # 2D payloads are accepted only as a single wrapped row.
    if arr.shape[0] != 1:
        raise ValueError("Bayes@N row-level prior payloads must contain exactly one row.")
    return arr.reshape(-1).tolist()


def _coerce_bayes_prior(priors: list[object | None], num_rows: int) -> list[list[int]] | object | None:
    """Merge per-row prior payloads into one prior argument for ``bayes_at_n``."""
    provided = [prior for prior in priors if prior is not None]
    if not provided:
        return None
    # Priors are all-or-nothing: a partial set of rows is a caller error.
    if len(provided) != len(priors):
        raise ValueError("Bayes@N prior observations must be provided for every row or omitted for every row.")

    first = np.asarray(provided[0])
    if _is_repeated_full_bayes_prior(provided, first, num_rows):
        # Every row duplicated the same full prior matrix: forward it untouched.
        return provided[0]

    prior_rows = [_coerce_bayes_prior_row(prior) for prior in provided]
    if len({len(row) for row in prior_rows}) != 1:
        raise ValueError("Bayes@N prior rows must all have the same number of observations.")
    return prior_rows


def _coerce_bayes_items(items: list[dict | list[int]]) -> tuple[list[list[int]], list[float] | None, object | None]:
    """Unpack sample-level Bayes@N payloads into ``(rows, weights, prior)``.

    Items are either bare score rows or dicts with a mandatory ``scores`` key
    plus optional ``weights`` and ``prior``. All rows must agree on length and
    on the weight vector when one is supplied.
    """
    if not items:
        raise ValueError("Bayes@N needs at least one row.")

    rows: list[list[int]] = []
    priors: list[object | None] = []
    shared_weights = None
    for item in items:
        if isinstance(item, dict):
            if "scores" not in item:
                raise ValueError("Bayes@N payloads must contain a 'scores' row.")
            scores = item["scores"]
            row_weights = item.get("weights")
            priors.append(item.get("prior"))
        else:
            scores = item
            row_weights = None
            priors.append(None)

        scores = list(scores)
        if not scores:
            raise ValueError("Bayes@N rows must contain at least one score.")
        rows.append(scores)

        if row_weights is not None:
            row_weights = np.asarray(row_weights, dtype=float)
            if shared_weights is None:
                # First explicit weight vector becomes the reference.
                shared_weights = row_weights
            elif not np.array_equal(shared_weights, row_weights):
                raise ValueError("Bayes@N received inconsistent weights across rows.")

    if len({len(row) for row in rows}) != 1:
        raise ValueError("Bayes@N requires every row to have the same number of scores.")

    weights_list = None if shared_weights is None else shared_weights.tolist()
    return rows, weights_list, _coerce_bayes_prior(priors, len(rows))


class BayesAtNCorpus(CorpusLevelComputation):
    """Corpus-level aggregation returning one Bayes@N posterior statistic."""

    def __init__(self, statistic: Literal["mu", "sigma"]):
        # Fail fast on typos so a misconfigured metric surfaces at definition time.
        if statistic not in {"mu", "sigma"}:
            raise ValueError("BayesAtNCorpus statistic must be either 'mu' or 'sigma'.")
        self.statistic = statistic

    def compute_corpus(self, items: list[dict | list[int]]) -> float:
        rows, weights, prior = _coerce_bayes_items(items)
        # bayes_at_n returns (mu, sigma); pick the configured statistic.
        statistics = dict(zip(("mu", "sigma"), bayes_at_n(rows, weights=weights, prior=prior)))
        return statistics[self.statistic]


# General aggregations
class MatthewsCorrCoef(CorpusLevelComputation):
def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
Expand Down
Loading