38 changes: 38 additions & 0 deletions docs/source/metric-list.mdx
@@ -61,3 +61,41 @@ These metrics need the model to generate an output. They are therefore slower.
- `llm_judge_llama_3_405b`: Can be used for any generative task; the model is scored by a Llama 3 405B model using the Hugging Face API.
- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task; the model is scored by a GPT-3.5 model using the OpenAI API. It is used for multi-turn tasks like MT-Bench.
- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task; the model is scored by a Llama 3 405B model using the Hugging Face API. It is used for multi-turn tasks like MT-Bench.

### POLLUX (custom metric)

[POLLUX](https://github.com/ai-forever/POLLUX) is a criteria-based LLM judge suitable for any generative task, with customizable criterion descriptions.

`PolluxLLMJudgeMetric` is exposed as a **custom** metric class in `lighteval.metrics.metrics_sample` (not registered on the built-in `Metrics` enum), because both the scoring scale and criterion description are defined at initialization time.

Use it by wrapping `PolluxLLMJudgeMetric(...)` into a `SampleLevelMetric`. Minimal setup:

```python
from lighteval.metrics.metrics_sample import PolluxLLMJudgeMetric
from lighteval.metrics.utils.metric_utils import SampleLevelMetric

pollux_helpfulness = SampleLevelMetric(
metric_name="pollux_helpfulness",
sample_level_fn=PolluxLLMJudgeMetric(
criteria_name="Helpfulness",
rubrics={
0: "Not helpful: misses the user request or gives incorrect guidance.",
1: "Partially helpful: addresses the request but misses important details.",
2: "Fully helpful: correct, complete, and directly actionable response.",
},
judge_model_name="ai-forever/pollux-judge-32b",
judge_backend="openai",
url="http://localhost:8000/v1", # OpenAI-compatible endpoint (e.g. vllm serve)
include_feedback=True,
),
batched_compute=True,
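    # Depending on your lighteval version, SampleLevelMetric may also require
    # fields such as category, corpus_level_fn, and higher_is_better.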
)
```
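
Depending on your lighteval version, the wrapped metric can then be referenced from a custom task's metric list alongside built-in `Metrics` entries.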

Backend details:
- `judge_backend="openai"` uses an OpenAI-compatible HTTP API endpoint (for example, one exposed by `vllm serve`), so pass `url` when initializing the metric.
- By default the judge is expected to return a **plain numeric** score: `score_pattern=None` falls back to `POLLUX_DEFAULT_SCORE_RE` from `lighteval.metrics.utils.judge_utils`. The metric always outputs `pollux_score`.
- `feedback_pattern=None` means no feedback text (the same default chain as `POLLUX_DEFAULT_FEEDBACK_RE` in `make_pollux_feedback_parser`). Even with `include_feedback=True`, `pollux_feedback` is only filled if you pass a pattern (e.g. `POLLUX_TAGGED_FEEDBACK_RE` for tagged `[FEEDBACK]...[RESULT]...` output).
- For tagged judge text (the 7B and 32B models), pass `score_pattern=POLLUX_TAGGED_SCORE_RE` and `feedback_pattern=POLLUX_TAGGED_FEEDBACK_RE` from the same module, as in the sketch after this list.
- `pollux_feedback` is text and must not be aggregated with `np.mean`; aggregate the numeric scores via `pollux_score` (or a custom `corpus_level_fn`).
- For several criteria, instantiate multiple metric objects with different outer names and the corresponding criterion and scale descriptions.
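
A minimal sketch of the tagged configuration (the rubric text is shortened for brevity, and the local endpoint URL is an assumption):

```python
from lighteval.metrics.metrics_sample import PolluxLLMJudgeMetric
from lighteval.metrics.utils.judge_utils import (
    POLLUX_TAGGED_FEEDBACK_RE,
    POLLUX_TAGGED_SCORE_RE,
)

pollux_tagged = PolluxLLMJudgeMetric(
    criteria_name="Helpfulness",
    rubrics={0: "Not helpful.", 1: "Partially helpful.", 2: "Fully helpful."},
    judge_model_name="ai-forever/pollux-judge-7b",
    judge_backend="openai",
    url="http://localhost:8000/v1",  # assumed local `vllm serve` endpoint
    score_pattern=POLLUX_TAGGED_SCORE_RE,  # parses "[RESULT] <score> [END]"
    feedback_pattern=POLLUX_TAGGED_FEEDBACK_RE,  # parses "[FEEDBACK] ... [RESULT]"
    include_feedback=True,
)
```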

Official judge checkpoints are published in the [POLLUX collection](https://huggingface.co/collections/ai-forever/pollux) (e.g. 7B, 32B, and newer variants). Pass the model **repository id** as `judge_model_name` (the same string used with `huggingface-cli download` or vLLM `--model`).
2 changes: 2 additions & 0 deletions docs/source/package_reference/metrics.mdx
@@ -74,3 +74,5 @@
[[autodoc]] metrics.metrics_sample.JudgeLLMMTBench
### JudgeLLMMixEval
[[autodoc]] metrics.metrics_sample.JudgeLLMMixEval
### PolluxLLMJudgeMetric
[[autodoc]] metrics.metrics_sample.PolluxLLMJudgeMetric
133 changes: 131 additions & 2 deletions src/lighteval/metrics/metrics_sample.py
@@ -27,8 +27,9 @@
import inspect
import logging
import os
from abc import ABC, abstractmethod
from re import Pattern
from typing import Callable, Literal, Union
from typing import Callable, Literal, Mapping, Union

import nltk
import numpy as np
Expand All @@ -51,7 +52,13 @@
remove_braces,
remove_braces_and_strip,
)
from lighteval.metrics.utils.judge_utils import get_judge_prompt_simpleqa, process_judge_response_simpleqa
from lighteval.metrics.utils.judge_utils import (
get_judge_prompt_pollux,
get_judge_prompt_simpleqa,
make_pollux_feedback_parser,
make_pollux_score_parser,
process_judge_response_simpleqa,
)
from lighteval.metrics.utils.llm_as_judge import JudgeLM
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc
@@ -1043,6 +1050,128 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) ->
return metrics


class PolluxLLMJudgeMetric(SampleLevelComputation):
"""POLLUX rubric judge as a sample-level metric (uses :class:`~lighteval.metrics.utils.llm_as_judge.JudgeLM`).

This class does not subclass :class:`JudgeLLM`, so arbitrary judge model names are allowed
for the ``openai`` backend (no OpenAI model whitelist).

Use with :class:`~lighteval.metrics.utils.metric_utils.SampleLevelMetric` and
``batched_compute=True``. For several criteria, add multiple metrics with different outer
``metric_name`` values so result columns do not collide; each instance sets ``criteria_name``
and ``rubrics`` in ``__init__`` only.

Data mapping:

- Question/instruction: ``doc.query``
- Answer: ``response.final_text[0]``
- Optional reference: ``doc.specific["reference_answer"]`` (if present) as POLLUX gold
- ``options``: not used (always ``None`` in the judge batch)

Returns per sample at least ``pollux_score``. If ``include_feedback=True``, also adds
``pollux_feedback`` using ``feedback_pattern`` (default: empty; use
``POLLUX_TAGGED_FEEDBACK_RE`` for ``[FEEDBACK]...[RESULT]``). Feedback is not
aggregatable at corpus level; use a custom ``corpus_level_fn`` that only averages
``pollux_score`` (e.g. ``lambda rows: np.mean([r["pollux_score"] for r in rows])``)
when you enable feedback.

Args:
criteria_name: Criterion title passed to the POLLUX template.
rubrics: Rubric / scale description for that criterion as a mapping
``score -> description`` (for example ``{0: "bad", 1: "ok", 2: "good"}``).
judge_model_name: Model id for the backend (e.g. Hugging Face repo id for POLLUX judges:
            ``ai-forever/Pollux-4B-Judge``, ``ai-forever/pollux-judge-7b``, ``ai-forever/pollux-judge-32b``, or ``-r`` variants; see
            the POLLUX collection on the Hub).
judge_backend: ``JudgeLM`` backend; default ``openai`` for OpenAI-compatible HTTP APIs
(e.g. ``vllm serve --api-key ...`` with ``OPENAI_BASE_URL`` / ``base_url``).
url: Optional API base URL (local or hosted OpenAI-compatible endpoint).
api_key: Optional API key (local servers often work without one).
max_tokens: Max new tokens for the judge completion.
backend_options: Optional backend-specific options (e.g. LiteLLM).
hf_provider: Required when ``judge_backend`` is ``inference-providers``.
response_format: Optional OpenAI ``response_format`` (default is plain text).
include_feedback: If True, include ``pollux_feedback`` in each sample dict (strings are
not corpus-aggregated by default ``np.mean``).
score_pattern: Optional regex for score (default: bare numeric response).
feedback_pattern: Optional regex for feedback (default: no feedback unless
``include_feedback`` with a non-``None`` pattern).
"""

def __init__(
self,
criteria_name: str,
rubrics: Mapping[int | str, str],
judge_model_name: str,
judge_backend: Literal["litellm", "openai", "transformers", "vllm", "tgi", "inference-providers"] = "openai",
url: str | None = None,
api_key: str | None = None,
max_tokens: int | None = 512,
backend_options: dict | None = None,
hf_provider: str | None = None,
response_format: BaseModel | None = None,
include_feedback: bool = False,
score_pattern: Pattern[str] | None = None,
feedback_pattern: Pattern[str] | None = None,
) -> None:
self.criteria_name = criteria_name
self.rubrics = self._normalize_rubrics(rubrics)
self.include_feedback = include_feedback
self._feedback_parser = make_pollux_feedback_parser(feedback_pattern)
self.judge = JudgeLM(
model=judge_model_name,
templates=get_judge_prompt_pollux,
process_judge_response=make_pollux_score_parser(score_pattern),
judge_backend=judge_backend,
url=url,
api_key=api_key,
max_tokens=max_tokens,
backend_options=backend_options,
hf_provider=hf_provider,
response_format=response_format,
)

@staticmethod
def _normalize_rubrics(rubrics: Mapping[int | str, str]) -> str:
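        """Render a ``score -> description`` mapping as sorted ``score: description`` lines."""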
if isinstance(rubrics, Mapping):
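            # Prefer numeric ordering of rubric keys (e.g. 0, 1, 2); fall back to lexical order.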
try:
items = sorted(rubrics.items(), key=lambda item: int(item[0]))
except (TypeError, ValueError):
items = sorted(rubrics.items(), key=lambda item: str(item[0]))
return "\n".join(f"{k}: {v}" for k, v in items)
raise TypeError("rubrics must be a mapping score->description")

def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list[dict]:
n = len(docs)
if len(responses) != n:
raise ValueError("responses and docs must have the same length")
questions = [d.query for d in docs]
predictions = [r.final_text[0] for r in responses]
options = [None] * n
        golds: list[str | None] = []  # optional reference answer
for d in docs:
ref = None
if d.specific:
raw = d.specific.get("reference_answer")
if raw is not None:
ref = str(raw).strip() or None
golds.append(ref)
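        # Judge the whole batch at once; criterion name and rubrics are broadcast per sample.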
scores, _prompts, judgements = self.judge.evaluate_answer_batch(
questions,
predictions,
options,
golds,
criteria_name=[self.criteria_name] * n,
rubrics=[self.rubrics] * n,
)
out: list[dict] = []
for i in range(n):
row: dict = {"pollux_score": scores[i]}
if self.include_feedback:
row["pollux_feedback"] = self._feedback_parser(judgements[i])
out.append(row)
return out


class JudgeLLMMTBench(JudgeLLM):
def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs):
"""Compute the score of a generative task using a llm as a judge.
116 changes: 116 additions & 0 deletions src/lighteval/metrics/utils/judge_utils.py
@@ -20,10 +20,22 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
import re


logger = logging.getLogger(__name__)

POLLUX_DEFAULT_SCORE_RE = re.compile(r"^\s*(\d+(?:[.,]\d+)?)\s*$")

POLLUX_DEFAULT_FEEDBACK_RE: re.Pattern[str] | None = None

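# Tagged judge output looks like "[FEEDBACK] <text> [RESULT] <score> [END]" (7B/32B judges).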
POLLUX_TAGGED_SCORE_RE = re.compile(
r"\[RESULT\]\s*([^\s\[]+)\s*\[END\]", re.IGNORECASE | re.DOTALL
)
POLLUX_TAGGED_FEEDBACK_RE = re.compile(
r"\[FEEDBACK\](.*?)\[RESULT\]", re.IGNORECASE | re.DOTALL
)


def get_judge_prompt_simpleqa(question: str, answer: str, gold: str, **kwargs):
GRADER_TEMPLATE = """
@@ -125,3 +137,107 @@ def process_judge_response_simpleqa(response: str) -> float:
else:
logger.warning(f"Unknown response from judge: {response}")
return 0.0


def _build_pollux_prompt_text(
instruction: str,
answer: str,
criteria_name: str,
rubrics: str,
reference_answer: str | None = None,
) -> str:
"""Format the POLLUX judge user message."""
sections = [
"### Задание для оценки:\n" + instruction,
]
if reference_answer is not None and reference_answer.strip():
sections.append("### Эталонный ответ:\n" + reference_answer)
sections.extend(
[
"### Ответ для оценки:\n" + answer,
"### Критерий оценки:\n" + criteria_name,
"### Шкала оценивания по критерию:\n" + rubrics,
]
)
return "\n\n".join(sections)


def get_judge_prompt_pollux(
question: str,
answer: str,
options: list[str] | None = None, # noqa: ARG001, left for compatibility with JudgeLM implementation
gold: str | None = None,
criteria_name: str = "",
rubrics: str = "",
):
"""Build chat messages for the POLLUX judge (OpenAI-style ``messages`` list).

Args:
question: Task instruction / question (maps to POLLUX ``instruction``).
answer: Model answer to score.
options: Ignored (POLLUX does not use multiple-choice options).
gold: Optional reference answer.
criteria_name: Criterion name and description.
        rubrics: Evaluation scale definitions for the criterion, normalized to a string (see ``PolluxLLMJudgeMetric._normalize_rubrics``).

Returns:
A one-turn chat messages list ``[{"role": "user", "content": ...}]``.
"""
body = _build_pollux_prompt_text(
instruction=question,
answer=answer,
criteria_name=criteria_name,
rubrics=rubrics,
reference_answer=gold if gold else None,
)
return [{"role": "user", "content": body}]


def make_pollux_score_parser(pattern=None):
"""Build a callable that parses POLLUX judge output to a float score.

``pattern`` defaults to :data:`POLLUX_DEFAULT_SCORE_RE` (bare numeric response).
Use :data:`POLLUX_TAGGED_SCORE_RE` for ``[RESULT] <score> [END]`` output.
"""
effective = pattern if pattern is not None else POLLUX_DEFAULT_SCORE_RE

def _parse(response: str | object) -> float:
text = response if isinstance(response, str) else str(response)
if not text:
return 0.0
match = effective.search(text)
if not match:
logger.warning("POLLUX judge response could not be parsed for score; returning 0.0")
return 0.0
raw = match.group(1).strip().replace(",", ".")
try:
return float(raw)
except ValueError:
logger.warning(f"POLLUX judge score not numeric: {raw!r}")
return 0.0

return _parse


def make_pollux_feedback_parser(pattern=None):
"""Build a callable that extracts feedback from POLLUX judge text.

``pattern`` defaults to :data:`POLLUX_DEFAULT_FEEDBACK_RE` (no feedback text).
Use :data:`POLLUX_TAGGED_FEEDBACK_RE` for ``[FEEDBACK]...[RESULT]`` output.
"""
effective = pattern if pattern is not None else POLLUX_DEFAULT_FEEDBACK_RE

def _parse(response: str | object) -> str:
if effective is None:
return ""
text = response if isinstance(response, str) else str(response)
if not text:
return ""
match = effective.search(text)
return match.group(1).strip() if match else ""

return _parse


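# Ready-made module-level parsers matching the defaults: plain numeric score, no feedback.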
process_judge_response_pollux = make_pollux_score_parser()
parse_pollux_feedback = make_pollux_feedback_parser()
11 changes: 8 additions & 3 deletions src/lighteval/metrics/utils/llm_as_judge.py
@@ -23,6 +23,7 @@

import asyncio
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
@@ -155,9 +156,13 @@ def __lazy_load_client(self):  # noqa: C901
if self.client is None:
from openai import OpenAI

self.client = OpenAI(
api_key=self.api_key if self.url is None else None, base_url=self.url if self.url else None
)
base_url = self.url if self.url else None
# Custom base_url: OpenAI SDK requires an explicit api_key (use env or "" for local servers).
if base_url is not None:
api_key = self.api_key if self.api_key is not None else os.getenv("OPENAI_API_KEY") or ""
else:
api_key = self.api_key
self.client = OpenAI(api_key=api_key, base_url=base_url)
return self.__call_api_parallel

case "litellm":