38 changes: 38 additions & 0 deletions docs/source/metric-list.mdx
@@ -61,3 +61,41 @@ These metrics need the model to generate an output. They are therefore slower.
- `llm_judge_llama_3_405b`: Can be used for any generative task; the model is scored by a Llama 3 405B model using the Hugging Face API.
- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task; the model is scored by a GPT-3.5 model using the OpenAI API. It is used for multi-turn tasks like MT-Bench.
- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task; the model is scored by a Llama 3 405B model using the Hugging Face API. It is used for multi-turn tasks like MT-Bench.

### POLLUX (custom metric)

[POLLUX](https://github.com/ai-forever/POLLUX) is a criteria-based LLM judge suitable for any generative task, with customizable criterion descriptions.

`PolluxLLMJudgeMetric` is exposed as a **custom** metric class in `lighteval.metrics.metrics_sample` (not registered on the built-in `Metrics` enum), because both the scoring scale and criterion description are defined at initialization time.

Use it by wrapping `PolluxLLMJudgeMetric(...)` into a `SampleLevelMetric`. Minimal setup:

```python
from lighteval.metrics.metrics_sample import PolluxLLMJudgeMetric
from lighteval.metrics.utils.metric_utils import SampleLevelMetric

pollux_helpfulness = SampleLevelMetric(
metric_name="pollux_helpfulness",
sample_level_fn=PolluxLLMJudgeMetric(
criteria_name="Helpfulness",
rubrics={
0: "Not helpful: misses the user request or gives incorrect guidance.",
1: "Partially helpful: addresses the request but misses important details.",
2: "Fully helpful: correct, complete, and directly actionable response.",
},
judge_model_name="ai-forever/pollux-judge-32b",
judge_backend="openai",
url="http://localhost:8000/v1", # OpenAI-compatible endpoint (e.g. vllm serve)
include_feedback=True,
),
batched_compute=True,
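    # Depending on your lighteval version, SampleLevelMetric may also require
    # fields such as category, corpus_level_fn, and higher_is_better.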
)
```
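
Depending on your lighteval version, the wrapped metric can then be referenced from a custom task's metric list alongside built-in `Metrics` entries.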

Backend details:
- `judge_backend="openai"` uses an OpenAI-compatible HTTP API endpoint (for example, one exposed by `vllm serve`), so pass `url` when initializing the metric.
- By default the judge is expected to return a **plain numeric** score: `score_pattern=None` falls back to `POLLUX_DEFAULT_SCORE_RE` from `lighteval.metrics.utils.judge_utils`. The metric always outputs `pollux_score`.
- `feedback_pattern=None` means no feedback text (the same default chain as `POLLUX_DEFAULT_FEEDBACK_RE` in `make_pollux_feedback_parser`). Even with `include_feedback=True`, `pollux_feedback` is only filled if you pass a pattern (e.g. `POLLUX_TAGGED_FEEDBACK_RE` for tagged `[FEEDBACK]...[RESULT]...` output).
- For tagged judge text (the 7B and 32B models), pass `score_pattern=POLLUX_TAGGED_SCORE_RE` and `feedback_pattern=POLLUX_TAGGED_FEEDBACK_RE` from the same module, as in the sketch after this list.
- `pollux_feedback` is text and must not be aggregated with `np.mean`; aggregate the numeric scores via `pollux_score` (or a custom `corpus_level_fn`).
- For several criteria, instantiate multiple metric objects with different outer names and the corresponding criterion and scale descriptions.
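
A minimal sketch of the tagged configuration (the rubric text is shortened for brevity, and the local endpoint URL is an assumption):

```python
from lighteval.metrics.metrics_sample import PolluxLLMJudgeMetric
from lighteval.metrics.utils.judge_utils import (
    POLLUX_TAGGED_FEEDBACK_RE,
    POLLUX_TAGGED_SCORE_RE,
)

pollux_tagged = PolluxLLMJudgeMetric(
    criteria_name="Helpfulness",
    rubrics={0: "Not helpful.", 1: "Partially helpful.", 2: "Fully helpful."},
    judge_model_name="ai-forever/pollux-judge-7b",
    judge_backend="openai",
    url="http://localhost:8000/v1",  # assumed local `vllm serve` endpoint
    score_pattern=POLLUX_TAGGED_SCORE_RE,  # parses "[RESULT] <score> [END]"
    feedback_pattern=POLLUX_TAGGED_FEEDBACK_RE,  # parses "[FEEDBACK] ... [RESULT]"
    include_feedback=True,
)
```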

Official judge checkpoints are published in the [POLLUX collection](https://huggingface.co/collections/ai-forever/pollux) (e.g. 7B, 32B, and newer variants). Pass the model **repository id** as `judge_model_name` (the same string used with `huggingface-cli download` or vLLM `--model`).
2 changes: 2 additions & 0 deletions docs/source/package_reference/metrics.mdx
@@ -74,3 +74,5 @@
[[autodoc]] metrics.metrics_sample.JudgeLLMMTBench
### JudgeLLMMixEval
[[autodoc]] metrics.metrics_sample.JudgeLLMMixEval
### PolluxLLMJudgeMetric
[[autodoc]] metrics.metrics_sample.PolluxLLMJudgeMetric
133 changes: 131 additions & 2 deletions src/lighteval/metrics/metrics_sample.py
@@ -27,8 +27,9 @@
import inspect
import logging
import os
from abc import ABC, abstractmethod
from re import Pattern
from typing import Callable, Literal, Union
from typing import Callable, Literal, Mapping, Union

import nltk
import numpy as np
Expand All @@ -51,7 +52,13 @@
remove_braces,
remove_braces_and_strip,
)
from lighteval.metrics.utils.judge_utils import get_judge_prompt_simpleqa, process_judge_response_simpleqa
from lighteval.metrics.utils.judge_utils import (
get_judge_prompt_pollux,
get_judge_prompt_simpleqa,
make_pollux_feedback_parser,
make_pollux_score_parser,
process_judge_response_simpleqa,
)
from lighteval.metrics.utils.llm_as_judge import JudgeLM
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.requests import Doc
@@ -1043,6 +1050,128 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) ->
return metrics


class PolluxLLMJudgeMetric(SampleLevelComputation):
"""POLLUX rubric judge as a sample-level metric (uses :class:`~lighteval.metrics.utils.llm_as_judge.JudgeLM`).

This class does not subclass :class:`JudgeLLM`, so arbitrary judge model names are allowed
for the ``openai`` backend (no OpenAI model whitelist).

Use with :class:`~lighteval.metrics.utils.metric_utils.SampleLevelMetric` and
``batched_compute=True``. For several criteria, add multiple metrics with different outer
``metric_name`` values so result columns do not collide; each instance sets ``criteria_name``
and ``rubrics`` in ``__init__`` only.

Data mapping:

- Question/instruction: ``doc.query``
- Answer: ``response.final_text[0]``
- Optional reference: ``doc.specific["reference_answer"]`` (if present) as POLLUX gold
- ``options``: not used (always ``None`` in the judge batch)

Returns per sample at least ``pollux_score``. If ``include_feedback=True``, also adds
``pollux_feedback`` using ``feedback_pattern`` (default: empty; use
``POLLUX_TAGGED_FEEDBACK_RE`` for ``[FEEDBACK]...[RESULT]``). Feedback is not
aggregatable at corpus level; use a custom ``corpus_level_fn`` that only averages
``pollux_score`` (e.g. ``lambda rows: np.mean([r["pollux_score"] for r in rows])``)
when you enable feedback.

Args:
criteria_name: Criterion title passed to the POLLUX template.
rubrics: Rubric / scale description for that criterion as a mapping
``score -> description`` (for example ``{0: "bad", 1: "ok", 2: "good"}``).
judge_model_name: Model id for the backend (e.g. Hugging Face repo id for POLLUX judges:
            ``ai-forever/Pollux-4B-Judge``, ``ai-forever/pollux-judge-7b``, ``ai-forever/pollux-judge-32b``, or ``-r`` variants; see
            the POLLUX collection on the Hub).
judge_backend: ``JudgeLM`` backend; default ``openai`` for OpenAI-compatible HTTP APIs
(e.g. ``vllm serve --api-key ...`` with ``OPENAI_BASE_URL`` / ``base_url``).
url: Optional API base URL (local or hosted OpenAI-compatible endpoint).
api_key: Optional API key (local servers often work without one).
max_tokens: Max new tokens for the judge completion.
backend_options: Optional backend-specific options (e.g. LiteLLM).
hf_provider: Required when ``judge_backend`` is ``inference-providers``.
response_format: Optional OpenAI ``response_format`` (default is plain text).
include_feedback: If True, include ``pollux_feedback`` in each sample dict (strings are
not corpus-aggregated by default ``np.mean``).
score_pattern: Optional regex for score (default: bare numeric response).
feedback_pattern: Optional regex for feedback (default: no feedback unless
``include_feedback`` with a non-``None`` pattern).
"""

def __init__(
self,
criteria_name: str,
rubrics: Mapping[int | str, str],
judge_model_name: str,
judge_backend: Literal["litellm", "openai", "transformers", "vllm", "tgi", "inference-providers"] = "openai",
url: str | None = None,
api_key: str | None = None,
max_tokens: int | None = 512,
backend_options: dict | None = None,
hf_provider: str | None = None,
response_format: BaseModel | None = None,
include_feedback: bool = False,
score_pattern: Pattern[str] | None = None,
feedback_pattern: Pattern[str] | None = None,
) -> None:
self.criteria_name = criteria_name
self.rubrics = self._normalize_rubrics(rubrics)
self.include_feedback = include_feedback
self._feedback_parser = make_pollux_feedback_parser(feedback_pattern)
self.judge = JudgeLM(
model=judge_model_name,
templates=get_judge_prompt_pollux,
process_judge_response=make_pollux_score_parser(score_pattern),
judge_backend=judge_backend,
url=url,
api_key=api_key,
max_tokens=max_tokens,
backend_options=backend_options,
hf_provider=hf_provider,
response_format=response_format,
)

@staticmethod
def _normalize_rubrics(rubrics: Mapping[int | str, str]) -> str:
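        """Render a ``score -> description`` mapping as sorted ``score: description`` lines."""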
if isinstance(rubrics, Mapping):
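            # Prefer numeric ordering of rubric keys (e.g. 0, 1, 2); fall back to lexical order.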
try:
items = sorted(rubrics.items(), key=lambda item: int(item[0]))
except (TypeError, ValueError):
items = sorted(rubrics.items(), key=lambda item: str(item[0]))
return "\n".join(f"{k}: {v}" for k, v in items)
raise TypeError("rubrics must be a mapping score->description")

def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) -> list[dict]:
n = len(docs)
if len(responses) != n:
raise ValueError("responses and docs must have the same length")
questions = [d.query for d in docs]
predictions = [r.final_text[0] for r in responses]
options = [None] * n
        golds: list[str | None] = []  # optional reference answer
for d in docs:
ref = None
if d.specific:
raw = d.specific.get("reference_answer")
if raw is not None:
ref = str(raw).strip() or None
golds.append(ref)
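        # Judge the whole batch at once; criterion name and rubrics are broadcast per sample.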
scores, _prompts, judgements = self.judge.evaluate_answer_batch(
questions,
predictions,
options,
golds,
criteria_name=[self.criteria_name] * n,
rubrics=[self.rubrics] * n,
)
out: list[dict] = []
for i in range(n):
row: dict = {"pollux_score": scores[i]}
if self.include_feedback:
row["pollux_feedback"] = self._feedback_parser(judgements[i])
out.append(row)
return out


class JudgeLLMMTBench(JudgeLLM):
def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs):
"""Compute the score of a generative task using a llm as a judge.
116 changes: 116 additions & 0 deletions src/lighteval/metrics/utils/judge_utils.py
@@ -20,10 +20,22 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
import re


logger = logging.getLogger(__name__)

POLLUX_DEFAULT_SCORE_RE = re.compile(r"^\s*(\d+(?:[.,]\d+)?)\s*$")

POLLUX_DEFAULT_FEEDBACK_RE: re.Pattern[str] | None = None

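# Tagged judge output looks like "[FEEDBACK] <text> [RESULT] <score> [END]" (7B/32B judges).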
POLLUX_TAGGED_SCORE_RE = re.compile(
r"\[RESULT\]\s*([^\s\[]+)\s*\[END\]", re.IGNORECASE | re.DOTALL
)
POLLUX_TAGGED_FEEDBACK_RE = re.compile(
r"\[FEEDBACK\](.*?)\[RESULT\]", re.IGNORECASE | re.DOTALL
)


def get_judge_prompt_simpleqa(question: str, answer: str, gold: str, **kwargs):
GRADER_TEMPLATE = """
@@ -125,3 +137,107 @@ def process_judge_response_simpleqa(response: str) -> float:
else:
logger.warning(f"Unknown response from judge: {response}")
return 0.0


def _build_pollux_prompt_text(
instruction: str,
answer: str,
criteria_name: str,
rubrics: str,
reference_answer: str | None = None,
) -> str:
"""Format the POLLUX judge user message."""
sections = [
"### Задание для оценки:\n" + instruction,
]
if reference_answer is not None and reference_answer.strip():
sections.append("### Эталонный ответ:\n" + reference_answer)
sections.extend(
[
"### Ответ для оценки:\n" + answer,
"### Критерий оценки:\n" + criteria_name,
"### Шкала оценивания по критерию:\n" + rubrics,
]
)
return "\n\n".join(sections)


def get_judge_prompt_pollux(
question: str,
answer: str,
options: list[str] | None = None, # noqa: ARG001, left for compatibility with JudgeLM implementation
gold: str | None = None,
criteria_name: str = "",
rubrics: str = "",
):
"""Build chat messages for the POLLUX judge (OpenAI-style ``messages`` list).

Args:
question: Task instruction / question (maps to POLLUX ``instruction``).
answer: Model answer to score.
options: Ignored (POLLUX does not use multiple-choice options).
gold: Optional reference answer.
criteria_name: Criterion name and description.
        rubrics: Evaluation scale definitions for the criterion, normalized to a string (see ``PolluxLLMJudgeMetric._normalize_rubrics``).

Returns:
A one-turn chat messages list ``[{"role": "user", "content": ...}]``.
"""
body = _build_pollux_prompt_text(
instruction=question,
answer=answer,
criteria_name=criteria_name,
rubrics=rubrics,
reference_answer=gold if gold else None,
)
return [{"role": "user", "content": body}]


def make_pollux_score_parser(pattern=None):
"""Build a callable that parses POLLUX judge output to a float score.

``pattern`` defaults to :data:`POLLUX_DEFAULT_SCORE_RE` (bare numeric response).
Use :data:`POLLUX_TAGGED_SCORE_RE` for ``[RESULT] <score> [END]`` output.
"""
effective = pattern if pattern is not None else POLLUX_DEFAULT_SCORE_RE

def _parse(response: str | object) -> float:
text = response if isinstance(response, str) else str(response)
if not text:
return 0.0
match = effective.search(text)
if not match:
logger.warning("POLLUX judge response could not be parsed for score; returning 0.0")
return 0.0
raw = match.group(1).strip().replace(",", ".")
try:
return float(raw)
except ValueError:
logger.warning(f"POLLUX judge score not numeric: {raw!r}")
return 0.0

return _parse


def make_pollux_feedback_parser(pattern=None):
"""Build a callable that extracts feedback from POLLUX judge text.

``pattern`` defaults to :data:`POLLUX_DEFAULT_FEEDBACK_RE` (no feedback text).
Use :data:`POLLUX_TAGGED_FEEDBACK_RE` for ``[FEEDBACK]...[RESULT]`` output.
"""
effective = pattern if pattern is not None else POLLUX_DEFAULT_FEEDBACK_RE

def _parse(response: str | object) -> str:
if effective is None:
return ""
text = response if isinstance(response, str) else str(response)
if not text:
return ""
match = effective.search(text)
return match.group(1).strip() if match else ""

return _parse


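# Ready-made module-level parsers matching the defaults: plain numeric score, no feedback.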
process_judge_response_pollux = make_pollux_score_parser()
parse_pollux_feedback = make_pollux_feedback_parser()
11 changes: 8 additions & 3 deletions src/lighteval/metrics/utils/llm_as_judge.py
@@ -23,6 +23,7 @@

import asyncio
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
@@ -155,9 +156,13 @@ def __lazy_load_client(self):  # noqa: C901
if self.client is None:
from openai import OpenAI

self.client = OpenAI(
api_key=self.api_key if self.url is None else None, base_url=self.url if self.url else None
)
base_url = self.url if self.url else None
# Custom base_url: OpenAI SDK requires an explicit api_key (use env or "" for local servers).
if base_url is not None:
api_key = self.api_key if self.api_key is not None else os.getenv("OPENAI_API_KEY") or ""
else:
api_key = self.api_key
self.client = OpenAI(api_key=api_key, base_url=base_url)
return self.__call_api_parallel

case "litellm":