From 55de7e235b4d5f8bb1003574a300b3174f9c548c Mon Sep 17 00:00:00 2001
From: ColorfulData <125739631+ColorfulData@users.noreply.github.com>
Date: Fri, 10 Apr 2026 08:06:21 +0530
Subject: [PATCH] fix(vllm): use max prompt length for batch context-length
 check

context_size was computed as len(inputs[0]), checking only the first
prompt in the batch. Any prompt longer than the first would bypass
truncation, causing vLLM to receive sequences exceeding max_model_len.

Fixes #1204.
---
 src/lighteval/models/vllm/vllm_model.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 3100c56b7..cda7ee335 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -355,7 +355,7 @@ def _greedy_until(
             # The choice we go for here is to avoid truncating the prompt if we can, since it
             # should have been managed by the prompt creator/few shot manager if requested by the user.
             inputs = tokenized["input_ids"]
-            context_size = len(inputs[0])
+            context_size = max(len(inp) for inp in inputs)
 
             # left truncate the inputs to the maximum length
             if self.max_length is None:
@@ -365,7 +365,7 @@ elif max_new_tokens is not None:
                 if context_size + max_new_tokens > self.max_length:
                     logger.warning(
-                        f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens."
+                        f"Batch max length {context_size} + {max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens."
                     )
                     context_size = self.max_length - max_new_tokens
                     if context_size < 0:
@@ -377,7 +377,7 @@ else:
                 if context_size > self.max_length:
                     logger.warning(
-                        f"{context_size=} which is greater than {self.max_length=}. Truncating context to {self.max_length} tokens."
+                        f"Batch max length {context_size=} which is greater than {self.max_length=}. Truncating context to {self.max_length} tokens."
                     )
                     context_size = self.max_length
                     inputs = [input[-context_size:] for input in inputs]
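
Reviewer note (not part of the patch): a minimal standalone sketch of the
failure mode the commit message describes. The token lists and the
max_length / max_new_tokens values below are toy numbers chosen for
illustration, not taken from lighteval.

    # Toy batch: the first prompt is short, the second one exceeds the window.
    max_length = 10
    max_new_tokens = 4
    inputs = [[1, 2, 3], list(range(20))]  # token id lists from the tokenizer

    # Old check: only the first prompt is inspected, so the 20-token prompt
    # is never truncated and reaches vLLM longer than the model window.
    context_size = len(inputs[0])                        # 3
    print(context_size + max_new_tokens > max_length)    # False -> no truncation

    # Fixed check: the longest prompt in the batch drives truncation.
    context_size = max(len(inp) for inp in inputs)       # 20
    if context_size + max_new_tokens > max_length:
        context_size = max_length - max_new_tokens       # 6
        inputs = [inp[-context_size:] for inp in inputs] # left-truncate every prompt

    assert all(len(inp) + max_new_tokens <= max_length for inp in inputs)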