Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/source/metric-list.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ These metrics need the model to generate an output. They are therefore slower.
- normalization on string pre-comparison on whitespace, articles, capitalization, ....
- comparing the full string, or only subsets (prefix, suffix, ...)
- `maj_at_k`: Model majority vote. Samples k generations from the model and assumes the most frequent is the actual prediction.
- `bayes_at_n`: Corpus-level Bayes@N for repeated generations. It reports `bayes@n`, the posterior mean, and `bayes@n_sigma`, the posterior standard deviation; multi-category outcomes require category weights.
- `f1_score`: Average F1 score in terms of word overlap between the model output and gold (normalisation optional).
- `f1_score_macro`: Corpus level macro F1 score.
- `f1_score_micro`: Corpus level micro F1 score.
Expand Down Expand Up @@ -54,7 +55,7 @@ These metrics need the model to generate an output. They are therefore slower.
- `edit_distance`: Average Levenshtein edit distance between model generation and reference,
- `edit_similarity`: Average Levenshtein edit similarity (normalized by the length of longer sequence) between model generation and reference.
- Math:
- Both `exact_match` and `maj_at_k` can be used to evaluate mathematics tasks with math specific normalization to remove and filter latex.
- `exact_match`, `maj_at_k`, and `bayes_at_n` can be used to evaluate mathematics tasks with math specific normalization to remove and filter latex.

## LLM-as-Judge
- `llm_judge_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API.
Expand Down
160 changes: 160 additions & 0 deletions src/lighteval/metrics/bayes_at_n.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

"""Bayes@N posterior moments for repeated categorical outcomes."""

from collections.abc import Sequence

import numpy as np


def _as_2d_int_matrix(values: Sequence[Sequence[int]] | np.ndarray, name: str) -> np.ndarray:
try:
matrix = np.asarray(values)
except ValueError as exc:
raise ValueError(f"{name} must be a rectangular 1D or 2D array.") from exc

if matrix.ndim == 1:
matrix = matrix.reshape(1, -1)
elif matrix.ndim != 2:
raise ValueError(f"{name} must be a 1D or 2D array.")

if matrix.shape[0] == 0:
raise ValueError(f"{name} must contain at least one row.")

if matrix.dtype == np.dtype("bool"):
return matrix.astype(int)
if np.issubdtype(matrix.dtype, np.integer):
return matrix.astype(int, copy=False)
if not np.issubdtype(matrix.dtype, np.number):
raise ValueError(f"{name} entries must be integer category ids.")

float_matrix = matrix.astype(float)
if not np.all(np.isfinite(float_matrix)):
raise ValueError(f"{name} entries must be finite integer category ids.")
if not np.all(float_matrix == np.floor(float_matrix)):
raise ValueError(f"{name} entries must be integer category ids.")
return float_matrix.astype(int)


def _as_weights(weights: Sequence[float] | np.ndarray | None, R: np.ndarray) -> np.ndarray:
if weights is None:
unique_values = np.unique(R)
if np.all(np.isin(unique_values, [0, 1])):
return np.array([0.0, 1.0])

unique_str = ", ".join(str(value) for value in unique_values)
raise ValueError(
f"R contains non-binary category ids ({unique_str}); pass weights to score multi-category outcomes."
)

weight_array = np.asarray(weights, dtype=float)
if weight_array.ndim != 1:
raise ValueError("weights must be a 1D array.")
if weight_array.size == 0:
raise ValueError("weights must contain at least one value.")
if not np.all(np.isfinite(weight_array)):
raise ValueError("weights must contain only finite values.")
return weight_array


def _validate_matrix_range(matrix: np.ndarray, low: int, high: int, name: str) -> None:
    """Raise ``ValueError`` unless every entry of *matrix* lies in ``[low, high]``."""
    if matrix.size == 0:
        # Nothing to validate (e.g. an empty prior matrix).
        return
    if not (low <= matrix.min() and matrix.max() <= high):
        raise ValueError(f"{name} entries must be integers in [{low}, {high}].")


def _row_bincount(matrix: np.ndarray, length: int) -> np.ndarray:
    """Count, per row, how many times each id in ``[0, length)`` occurs."""
    counts = np.zeros((matrix.shape[0], length), dtype=int)
    if matrix.shape[1] == 0:
        # No observations at all: every row has all-zero counts.
        return counts
    for row_index, row in enumerate(matrix):
        # Unbuffered add so repeated ids within a row accumulate correctly.
        np.add.at(counts[row_index], row, 1)
    return counts


def _as_prior_matrix(
prior: Sequence[Sequence[int]] | np.ndarray | None,
num_rows: int,
) -> np.ndarray:
if prior is None:
return np.zeros((num_rows, 0), dtype=int)

prior_matrix = _as_2d_int_matrix(prior, "prior")
if prior_matrix.ndim == 1:
prior_matrix = prior_matrix.reshape(1, -1)
if prior_matrix.shape[0] != num_rows:
if prior_matrix.size % num_rows != 0:
raise ValueError("prior must have the same number of rows as R.")
prior_matrix = prior_matrix.reshape(num_rows, -1)
return prior_matrix


def bayes_at_n(
    R: Sequence[Sequence[int]] | np.ndarray,
    weights: Sequence[float] | np.ndarray | None = None,
    prior: Sequence[Sequence[int]] | np.ndarray | None = None,
) -> tuple[float, float]:
    """Return the Bayes@N posterior mean and standard deviation.

    Args:
        R: ``M x N`` matrix of integer category ids. A 1D array is treated as
            one row.
        weights: Category score weights. If omitted, ``R`` must be binary and
            weights ``[0.0, 1.0]`` are used.
        prior: Optional ``M x D`` matrix of row-aligned prior observations.

    Returns:
        ``(mu, sigma)``, where ``mu`` is the posterior mean and ``sigma`` is
        the posterior standard deviation.
    """
    outcomes = _as_2d_int_matrix(R, "R")
    if outcomes.shape[1] == 0:
        raise ValueError("R must contain at least one outcome per row.")

    weight_vector = _as_weights(weights, outcomes)
    num_rows, num_samples = outcomes.shape
    max_category = weight_vector.size - 1
    prior_matrix = _as_prior_matrix(prior, num_rows)

    _validate_matrix_range(outcomes, 0, max_category, "R")
    _validate_matrix_range(prior_matrix, 0, max_category, "prior")

    num_prior = prior_matrix.shape[1]
    # Per-row pseudo-count total: one uniform Dirichlet count per category
    # (max_category + 1 of them) plus the prior and observed sample counts.
    total_count = 1 + max_category + num_prior + num_samples

    observed_counts = _row_bincount(outcomes, max_category + 1)
    # +1 adds the uniform Dirichlet(1) pseudo-count to each category.
    posterior_counts = observed_counts + _row_bincount(prior_matrix, max_category + 1) + 1

    # Shift weights so category 0 contributes zero; add w[0] back at the end.
    delta_weights = weight_vector - weight_vector[0]
    mu = weight_vector[0] + (posterior_counts @ delta_weights).sum() / (num_rows * total_count)

    posterior_probs = posterior_counts / total_count
    second_moment = (posterior_probs * (delta_weights**2)).sum(axis=1)
    squared_mean = (posterior_probs @ delta_weights) ** 2
    # Per-row posterior variance of the mean shrinks by (total_count + 1);
    # averaging M independent rows divides the variance by M^2.
    sigma = np.sqrt((second_moment - squared_mean).sum() / (num_rows**2 * (total_count + 1)))

    return float(mu), float(sigma)


__all__ = ["bayes_at_n"]
39 changes: 39 additions & 0 deletions src/lighteval/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from lighteval.metrics.harness_compatibility.drop import DropMetrics
from lighteval.metrics.harness_compatibility.truthful_qa import TruthfulqaMCMetrics
from lighteval.metrics.metrics_corpus import (
BayesAtNCorpus,
CorpusLevelF1Score,
CorpusLevelPerplexityMetric,
CorpusLevelTranslationMetric,
Expand All @@ -44,6 +45,7 @@
ROUGE,
AccGoldLikelihood,
AvgAtN,
BayesAtN,
BertScore,
ExactMatches,
Extractiveness,
Expand Down Expand Up @@ -172,6 +174,43 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
# Bayes@N over repeated generations: "bayes@n" is the corpus-level posterior
# mean and "bayes@n_sigma" the posterior standard deviation (an uncertainty
# estimate, hence lower is better).
bayes_at_n = CorpusLevelMetricGrouping(
metric_name=["bayes@n", "bayes@n_sigma"],
sample_level_fn=BayesAtN(strip_strings=True),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={
"bayes@n": BayesAtNCorpus("mu"),
"bayes@n_sigma": BayesAtNCorpus("sigma"),
},
higher_is_better={
"bayes@n": True,
"bayes@n_sigma": False,
},
)
# Math-specific Bayes@N variant: per-sample correctness is judged by
# extractive expression/LaTeX matching (boxed answers prioritized) before the
# corpus-level posterior mean/std-dev aggregation.
bayes_at_n_math = CorpusLevelMetricGrouping(
metric_name=["math-bayes@n", "math-bayes@n_sigma"],
sample_level_fn=BayesAtN(
name_prefix="math",
strip_strings=True,
sample_scoring_function=MultilingualExtractiveMatchMetric(
language=Language.ENGLISH,
fallback_mode="first_match",
precision=5,
gold_extraction_target=(ExprExtractionConfig(),),
pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
aggregation_function=max,
),
),
category=SamplingMethod.GENERATIVE,
corpus_level_fn={
"math-bayes@n": BayesAtNCorpus("mu"),
"math-bayes@n_sigma": BayesAtNCorpus("sigma"),
},
higher_is_better={
"math-bayes@n": True,
"math-bayes@n_sigma": False,
},
)
bert_score = SampleLevelMetricGrouping(
metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"],
sample_level_fn=BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip),
Expand Down
91 changes: 91 additions & 0 deletions src/lighteval/metrics/metrics_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import sacrebleu
import sklearn.metrics

from lighteval.metrics.bayes_at_n import bayes_at_n
from lighteval.metrics.sample_preparator import (
GenerativeCorpusMetricInput,
LogprobCorpusMetricInput,
Expand Down Expand Up @@ -62,6 +63,96 @@ def __str__(self):
return f"{self.__class__.__name__}({', '.join(attr_strs)})"


def _is_repeated_full_bayes_prior(non_null_priors: list[object], first_prior: np.ndarray, num_rows: int) -> bool:
    """Return True when every row carries the same full-corpus prior payload."""
    for candidate in non_null_priors:
        if not np.array_equal(np.asarray(candidate), first_prior):
            return False
    # A shared payload counts as a full prior only if its shape matches the corpus.
    if first_prior.ndim == 2:
        return first_prior.shape[0] == num_rows
    return first_prior.ndim == 1 and num_rows == 1


def _coerce_bayes_prior_row(prior: object) -> list[int]:
    """Flatten one per-row prior payload into a plain list of category ids."""
    arr = np.asarray(prior)
    if arr.ndim == 0:
        raise ValueError("Bayes@N prior rows must be 1D arrays.")
    if arr.ndim == 1:
        return arr.tolist()
    if arr.ndim != 2:
        raise ValueError("Bayes@N row-level prior payloads must be 1D arrays.")
    # 2D payloads are accepted only as a single wrapped row.
    if arr.shape[0] != 1:
        raise ValueError("Bayes@N row-level prior payloads must contain exactly one row.")
    return arr.reshape(-1).tolist()


def _coerce_bayes_prior(priors: list[object | None], num_rows: int) -> list[list[int]] | object | None:
    """Merge per-row prior payloads into one prior argument for ``bayes_at_n``."""
    provided = [prior for prior in priors if prior is not None]
    if not provided:
        return None
    # Priors are all-or-nothing: a partial set of rows is a caller error.
    if len(provided) != len(priors):
        raise ValueError("Bayes@N prior observations must be provided for every row or omitted for every row.")

    first = np.asarray(provided[0])
    if _is_repeated_full_bayes_prior(provided, first, num_rows):
        # Every row duplicated the same full prior matrix: forward it untouched.
        return provided[0]

    prior_rows = [_coerce_bayes_prior_row(prior) for prior in provided]
    if len({len(row) for row in prior_rows}) != 1:
        raise ValueError("Bayes@N prior rows must all have the same number of observations.")
    return prior_rows


def _coerce_bayes_items(items: list[dict | list[int]]) -> tuple[list[list[int]], list[float] | None, object | None]:
    """Unpack sample-level Bayes@N payloads into ``(rows, weights, prior)``.

    Items are either bare score rows or dicts with a mandatory ``scores`` key
    plus optional ``weights`` and ``prior``. All rows must agree on length and
    on the weight vector when one is supplied.
    """
    if not items:
        raise ValueError("Bayes@N needs at least one row.")

    rows: list[list[int]] = []
    priors: list[object | None] = []
    shared_weights = None
    for item in items:
        if isinstance(item, dict):
            if "scores" not in item:
                raise ValueError("Bayes@N payloads must contain a 'scores' row.")
            scores = item["scores"]
            row_weights = item.get("weights")
            priors.append(item.get("prior"))
        else:
            scores = item
            row_weights = None
            priors.append(None)

        scores = list(scores)
        if not scores:
            raise ValueError("Bayes@N rows must contain at least one score.")
        rows.append(scores)

        if row_weights is not None:
            row_weights = np.asarray(row_weights, dtype=float)
            if shared_weights is None:
                # First explicit weight vector becomes the reference.
                shared_weights = row_weights
            elif not np.array_equal(shared_weights, row_weights):
                raise ValueError("Bayes@N received inconsistent weights across rows.")

    if len({len(row) for row in rows}) != 1:
        raise ValueError("Bayes@N requires every row to have the same number of scores.")

    weights_list = None if shared_weights is None else shared_weights.tolist()
    return rows, weights_list, _coerce_bayes_prior(priors, len(rows))


class BayesAtNCorpus(CorpusLevelComputation):
    """Corpus-level aggregation returning one Bayes@N posterior statistic."""

    def __init__(self, statistic: Literal["mu", "sigma"]):
        # Fail fast on typos so a misconfigured metric surfaces at definition time.
        if statistic not in {"mu", "sigma"}:
            raise ValueError("BayesAtNCorpus statistic must be either 'mu' or 'sigma'.")
        self.statistic = statistic

    def compute_corpus(self, items: list[dict | list[int]]) -> float:
        rows, weights, prior = _coerce_bayes_items(items)
        # bayes_at_n returns (mu, sigma); pick the configured statistic.
        statistics = dict(zip(("mu", "sigma"), bayes_at_n(rows, weights=weights, prior=prior)))
        return statistics[self.statistic]


# General aggregations
class MatthewsCorrCoef(CorpusLevelComputation):
def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
Expand Down
Loading