diff --git a/examples/bench.py b/examples/bench.py index 453c64cd..1d00f326 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -1,10 +1,10 @@ import infinicore -from transformers import AutoTokenizer -from infinilm.modeling_utils import load_model_state_dict_by_file -from infinilm.distributed import DistConfig -from infinilm.infer_engine import GenerationConfig, InferEngine from infinilm.base_config import BaseConfig from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig +from infinilm.distributed import DistConfig +from infinilm.infer_engine import GenerationConfig, InferEngine +from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.tokenizer_utils import InfiniLMTokenizer import argparse import sys import time @@ -42,9 +42,9 @@ def get_test_cases( input_len_list: list[int], output_len_list: list[int], ): + """Generate cases ordered by ascending KV cache memory usage.""" model_path = os.path.expanduser(model_path) - """Generate cases ordered by ascending KV cache memory usage.""" # Load model config to derive attention dimensions config = read_json_file(os.path.join(model_path, "config.json")) head_dim = config.get( @@ -92,19 +92,23 @@ def get_test_cases( return case_dict +# Load benchmark prompt from file with open("examples/bench_prompt.md", "r") as f: prompt = f.read() def repeat_prompt(input_ids: list[int], target_length: int): + """Repeat or truncate input_ids to match target_length.""" num = len(input_ids) repeat_times = (target_length + num - 1) // num return (input_ids * repeat_times)[:target_length] class TestModel: + """Benchmark model wrapper for performance testing.""" + model: infinicore.nn.Module - tokenizer: AutoTokenizer + tokenizer: InfiniLMTokenizer input_ids_list: list[int] def __init__( @@ -118,8 +122,9 @@ def __init__( attn_backend="default", ) -> None: model_path = os.path.expanduser(model_path) + # ---------------------------------------------------------------------------- # - # 创建模型, + # Create Model # ---------------------------------------------------------------------------- # model = InferEngine( model_path, @@ -132,47 +137,30 @@ def __init__( ) # ---------------------------------------------------------------------------- # - # 加载权重 + # Load Weights # ---------------------------------------------------------------------------- # if not skip_load: load_model_state_dict_by_file(model, model_path, dtype=model.dtype) # ---------------------------------------------------------------------------- # - # 创建 tokenizer + # Initialize Tokenizer # ---------------------------------------------------------------------------- # - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - if tokenizer.pad_token is None: - if tokenizer.eos_token is not None: - tokenizer.pad_token = tokenizer.eos_token - tokenizer.pad_token_id = tokenizer.eos_token_id - else: - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + self.tokenizer = InfiniLMTokenizer(model_path) # ---------------------------------------------------------------------------- # - # token编码 + # Encode Prompt # ---------------------------------------------------------------------------- # input_content = [ - tokenizer.apply_chat_template( + self.tokenizer.apply_chat_template( conversation=[{"role": "user", "content": prompt}], add_generation_prompt=True, tokenize=False, ) ] - # print(input_content, end="", flush=True) - # Support Transformers >= 5.0 for batch_encode_plus deprecation - encoding = tokenizer( - input_content, - padding=True, - 
truncation=True, - max_length=8192, - ) - - input_ids_list = encoding["input_ids"] + input_ids_list = self.tokenizer.encode(input_content) self.model = model - self.tokenizer = tokenizer self.input_ids_list = input_ids_list def run( @@ -184,11 +172,12 @@ def run( top_p=1.0, temperature=1.0, ): + """Run a single benchmark test case.""" input_ids = repeat_prompt(self.input_ids_list[0], target_length=input_len) input_ids_list = [input_ids] * batch_size # ---------------------------------------------------------------------------- # - # 自回归生成 + # Autoregressive Generation # ---------------------------------------------------------------------------- # input_ids_infini = infinicore.from_list(input_ids_list) @@ -211,6 +200,7 @@ def run( numpy_output_ids = np.array( [output_id.to_numpy()[0] for output_id in output_ids] ) + # Use InfiniLMTokenizer for decoding print(self.tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) print( @@ -224,8 +214,9 @@ def run( device_str = cfg.get_device_str(cfg.device) _PAGED_KV_BLOCK_SIZE = cfg.block_size + # -------------------------------------------------------- # - # 解析参数 + # Parse Arguments # -------------------------------------------------------- # model_path = cfg.model @@ -252,8 +243,9 @@ def run( output_len = [output_len] cases_dict = get_test_cases(model_path, batch_size, input_len, output_len) + # -------------------------------------------------------- # - # 测试 + # Initialize Test Configuration # -------------------------------------------------------- # if enable_paged_attn: paged_kv_block_size = _PAGED_KV_BLOCK_SIZE @@ -290,7 +282,7 @@ def run( if cfg.warmup: warmup_steps = 1 - # warmup cache capacity + # Warmup cache capacity warmup_cache_len = 128 warmup_batch = len(test.input_ids_list) @@ -316,7 +308,7 @@ def run( _ = test.model.generate( input_ids_infini, GenerationConfig( - max_new_tokens=5, # decode kernel warmup + max_new_tokens=5, # Decode kernel warmup temperature=cfg.temperature, top_k=cfg.top_k, top_p=cfg.top_p, @@ -327,12 +319,12 @@ def run( print("=================== warmup done ====================") - # reset cache back to benchmark config + # Reset cache back to benchmark config if cache_config is not None: test.model.reset_cache(cache_config) # ---------------------------------------------------------------------------- # - # Warmup done + # Run Benchmarks # ---------------------------------------------------------------------------- # for idx, case in tqdm(cases_dict.items(), desc="Processing cases"): @@ -343,7 +335,7 @@ def run( output_len = case["output_len"] if not enable_paged_attn: - # reset cache if static kvcache is used + # Reset cache if static KV cache is used initial_capacity = input_len + output_len test.model.reset_cache( StaticKVCacheConfig( @@ -351,7 +343,7 @@ def run( ) ) - # run test one case + # Run test for one case test.run( batch_size=batch_size, input_len=input_len, diff --git a/examples/test_infer.py b/examples/test_infer.py index abec5d00..a3551efc 100644 --- a/examples/test_infer.py +++ b/examples/test_infer.py @@ -1,18 +1,16 @@ import infinicore -import transformers -from transformers import AutoTokenizer -from tokenizers import decoders as _dec -from infinilm.modeling_utils import load_model_state_dict_by_file -from infinilm.distributed import DistConfig -from infinilm.infer_engine import GenerationConfig, InferEngine import argparse import sys import time import os import numpy as np from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig -from packaging import version + from 
infinilm.base_config import BaseConfig +from infinilm.distributed import DistConfig +from infinilm.infer_engine import GenerationConfig, InferEngine +from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.tokenizer_utils import InfiniLMTokenizer from PIL import Image import torch @@ -37,6 +35,7 @@ def test( image_path=None, ): model_path = os.path.expanduser(model_path) + # ---------------------------------------------------------------------------- # # Create Model # ---------------------------------------------------------------------------- # @@ -51,16 +50,18 @@ def test( attention_backend=attn_backend, kv_cache_dtype=cfg.kv_cache_dtype, ) + # ---------------------------------------------------------------------------- # # Load Weights # ---------------------------------------------------------------------------- # load_model_state_dict_by_file(model, model_path, dtype=model.dtype) # ---------------------------------------------------------------------------- # - # create tokenizer + # Initialize Tokenizer # ---------------------------------------------------------------------------- # - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + tokenizer = InfiniLMTokenizer(model_path) + # Initialize processor for multimodal models processor = None if image_path is not None: if model.model_type == "minicpmv": @@ -69,33 +70,14 @@ def test( processor = AutoProcessor.from_pretrained( model_path, trust_remote_code=True ) - tokenizer = processor.tokenizer - - if "llama" == model.model_type: - backend = getattr(tokenizer, "backend_tokenizer", None) - target = getattr(backend, "_tokenizer", backend) - norm = getattr(target, "normalizer", None) - dec = getattr(target, "decoder", None) - sn = repr(norm)[:800] if norm is not None else "" - sd = repr(dec)[:800] if dec is not None else "" - has_prepend = "Prepend" in sn - has_strip = "Strip" in sd - if has_prepend and has_strip: - target.decoder = _dec.Sequence( - [ - _dec.Replace("▁", " "), - _dec.ByteFallback(), - _dec.Fuse(), - ] - ) # ---------------------------------------------------------------------------- # - # tokenize + # Tokenize Inputs # ---------------------------------------------------------------------------- # - # prompt = "山东最高的山是?" 
if isinstance(prompts, str): prompts = [prompts] + # Handle image prompts for multimodal models if image_path is not None: updated_prompts = [] for prompt in prompts: @@ -104,18 +86,17 @@ def test( updated_prompts.append(prompt) prompts = updated_prompts - if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None: - input_contents = [ - tokenizer.apply_chat_template( - conversation=[{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - for prompt in prompts - ] - else: - input_contents = prompts + # Apply chat template or use raw prompts + input_contents = [ + tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + for prompt in prompts + ] + # Process multimodal inputs or encode text pixel_values = None image_bound = None tgt_sizes = None @@ -139,39 +120,14 @@ def test( else: raise ValueError(f"Unsupported multimodal model_type: {model.model_type}") else: - if hasattr(tokenizer, "batch_encode_plus"): - input_ids_list = tokenizer.batch_encode_plus(input_contents)["input_ids"] - elif hasattr(tokenizer, "_encode_plus"): - input_ids_list = tokenizer._encode_plus(input_contents)["input_ids"] - else: - input_ids_list = tokenizer(input_contents)[ - "input_ids" - ] # List: [[1, 1128, 526, 366, 29892]] - - # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ - # "input_ids" - # ] # List: [[1, 1128, 526, 366, 29892]] - if version.parse(transformers.__version__) < version.parse("5.0.0"): - # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. - input_ids_list = [ - tokenizer.encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - else: - input_ids_list = [ - tokenizer._encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] + # Use InfiniLMTokenizer for encoding + input_ids_list = tokenizer.encode(input_contents) # ---------------------------------------------------------------------------- # - # Create KVCache + # Create KV Cache # ---------------------------------------------------------------------------- # if enable_paged_attn: - batch_size = 1 if prompts is str else len(prompts) + batch_size = 1 if isinstance(prompts, str) else len(prompts) max_total_tokens = max_new_tokens + len(input_ids_list[0]) cache_config = PagedKVCacheConfig( num_blocks=( @@ -181,7 +137,7 @@ def test( block_size=_PAGED_KV_BLOCK_SIZE, ) else: - batch_size = 1 if prompts is str else len(prompts) + batch_size = 1 if isinstance(prompts, str) else len(prompts) initial_capacity = max_new_tokens + len(input_ids_list[0]) cache_config = StaticKVCacheConfig( max_batch_size=batch_size, max_cache_len=initial_capacity @@ -223,7 +179,7 @@ def test( ) pixel_values_infini = infinicore.from_torch(pixel_values_tensor) - # 2. tgt_sizes + # 2. Target sizes all_tgt_sizes = [ tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor) ] @@ -232,7 +188,7 @@ def test( tgt_sizes_infini = infinicore.from_torch(tgt_sizes_tensor) - # 3. image_bound + # 3. 
Image bounds batch_size = len(image_bound) max_ranges = max(len(b) for b in image_bound) diff --git a/python/infinilm/__init__.py b/python/infinilm/__init__.py index f552a2cc..51edf681 100644 --- a/python/infinilm/__init__.py +++ b/python/infinilm/__init__.py @@ -3,6 +3,7 @@ from . import cache from . import llm from . import base_config +from . import tokenizer_utils from .llm import ( LLM, @@ -18,6 +19,7 @@ "cache", "llm", "base_config", + "tokenizer_utils", # LLM classes "LLM", "AsyncLLMEngine", diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index b640cf20..816b353f 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -14,9 +14,6 @@ from typing import List, Optional, Union, AsyncIterator from dataclasses import dataclass -from transformers import AutoTokenizer -from tokenizers import decoders as _dec - import infinicore from infinilm.llm.request import ( @@ -33,6 +30,7 @@ from infinilm.infer_engine import InferEngine from infinilm.cache.cache import PagedKVCacheConfig, StaticKVCacheConfig from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.tokenizer_utils import InfiniLMTokenizer logger = logging.getLogger(__name__) @@ -99,11 +97,8 @@ def __init__(self, config: EngineConfig): self.model_engine, config.model_path, dtype=self.model_engine.dtype ) - # Initialize tokenizer - self.tokenizer = AutoTokenizer.from_pretrained( - config.model_path, trust_remote_code=True - ) - self._fix_tokenizer_decoder() + # Initialize tokenizer using InfiniLMTokenizer + self.tokenizer = InfiniLMTokenizer(config.model_path) # Initialize KV cache based on cache type if config.cache_type == "static": @@ -130,14 +125,14 @@ def __init__(self, config: EngineConfig): self.model_engine.reset_cache(cache_config) self.cache_type = config.cache_type - # Get EOS token IDs from model config - self.eos_token_ids = self.model_engine.eos_token_id or [] + # Get EOS token IDs from tokenizer + self.eos_token_ids = self.tokenizer.eos_token_id or [] if isinstance(self.eos_token_ids, int): self.eos_token_ids = [self.eos_token_ids] logger.info( f"LLMEngine initialized with model at {config.model_path} " - f"on device {config.device}" + f"on device {config.device} " f"enable_graph={config.enable_graph}" ) @@ -166,26 +161,6 @@ def _init_device(self): self.dtype = dtype_map[self.config.dtype] - def _fix_tokenizer_decoder(self): - """Fix tokenizer decoder for llama models.""" - if "llama" in self.model_engine.model_type.lower(): - backend = getattr(self.tokenizer, "backend_tokenizer", None) - target = getattr(backend, "_tokenizer", backend) - norm = getattr(target, "normalizer", None) - dec = getattr(target, "decoder", None) - sn = repr(norm)[:800] if norm is not None else "" - sd = repr(dec)[:800] if dec is not None else "" - has_prepend = "Prepend" in sn - has_strip = "Strip" in sd - if has_prepend and has_strip: - target.decoder = _dec.Sequence( - [ - _dec.Replace("▁", " "), - _dec.ByteFallback(), - _dec.Fuse(), - ] - ) - def add_request(self, request: InferenceRequest): """Add a request to the scheduler.""" self.scheduler.add_request(request) @@ -341,7 +316,8 @@ def _check_request_finished(self, req: InferenceRequest, token_id: int) -> bool: req.finish_reason = FinishReason.EOS_TOKEN return True - # While ignoring EOS, stop strings are also ignored to avoid requiring additional arguments for benchmarking. + # While ignoring EOS, stop strings are also ignored to avoid requiring + # additional arguments for benchmarking. 
# Check stop strings # Remove stop string from generated_text if STOP_STRING is the finishing reason stop_strings = req.sampling_params.stop or [] @@ -354,11 +330,11 @@ def _check_request_finished(self, req: InferenceRequest, token_id: int) -> bool: return False def tokenize(self, text: str) -> List[int]: - """Tokenize text to token IDs.""" + """Tokenize text to token IDs using InfiniLMTokenizer.""" return self.tokenizer.encode(text) def detokenize(self, token_ids: List[int]) -> str: - """Detokenize token IDs to text.""" + """Detokenize token IDs to text using InfiniLMTokenizer.""" return self.tokenizer.decode(token_ids) def apply_chat_template( @@ -367,7 +343,7 @@ def apply_chat_template( add_generation_prompt: bool = True, chat_template_kwargs: Optional[dict] = None, ) -> str: - """Apply chat template to messages.""" + """Apply chat template to messages using InfiniLMTokenizer.""" chat_template_kwargs = chat_template_kwargs or {} return self.tokenizer.apply_chat_template( conversation=messages, diff --git a/python/infinilm/tokenizer_utils.py b/python/infinilm/tokenizer_utils.py new file mode 100644 index 00000000..ad36c8b9 --- /dev/null +++ b/python/infinilm/tokenizer_utils.py @@ -0,0 +1,336 @@ +""" +Tokenizer utilities for InfiniLM. + +This module provides InfiniLMTokenizer class that encapsulates all tokenizer +operations including initialization, encoding/decoding, chat template handling, +and model-specific fixes. +""" + +import os +import json +from typing import List, Optional, Union, Any + +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from tokenizers import decoders as _dec + + +class InfiniLMTokenizer: + """Unified tokenizer wrapper for InfiniLM. + + This class encapsulates all tokenizer-related operations including: + - Model-specific initialization and fixes + - Encoding/decoding + - Chat template application + - Pad token configuration + + Attributes: + tokenizer: The underlying HuggingFace tokenizer instance. + model_type: The model type string (e.g., 'llama', 'qwen2', 'minicpm'). + eos_token_id: End-of-sequence token ID(s). + """ + + def __init__( + self, + model_path: str, + trust_remote_code: Optional[bool] = None, + **kwargs: Any, + ): + """Initialize the tokenizer for a given model. + + Args: + model_path: Path to the model directory containing config.json and tokenizer files. + trust_remote_code: Whether to trust remote code. + If None, will be determined based on model type. + **kwargs: Additional keyword arguments passed to AutoTokenizer.from_pretrained(). + + Raises: + FileNotFoundError: If config.json is not found in model_path. + ValueError: If tokenizer initialization fails. 
+ """ + self._model_path = os.path.expanduser(model_path) + + # Load model config to determine model type + config_path = os.path.join(self._model_path, "config.json") + if not os.path.isfile(config_path): + raise FileNotFoundError(f"config.json not found in {self._model_path}") + + with open(config_path, "r", encoding="utf-8") as f: + config_dict = json.load(f) + + self.model_type = config_dict.get("model_type", "").lower() + + # Determine trust_remote_code based on model type if not explicitly specified + if trust_remote_code is None: + trust_remote_code = self._should_trust_remote_code(self.model_type) + + # Initialize the underlying tokenizer + try: + self.tokenizer = AutoTokenizer.from_pretrained( + self._model_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) + except Exception as e: + raise ValueError( + f"Failed to initialize tokenizer from {self._model_path}: {e}" + ) + + # Apply model-specific fixes + self._apply_model_specific_fixes() + + # Configure pad token + self._configure_pad_token() + + # Extract EOS token ID from config + eos_token_id = config_dict.get("eos_token_id") + if eos_token_id is not None: + self.eos_token_id = ( + [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id + ) + else: + self.eos_token_id = [] + + # Ensure EOS token ID is always a list + if isinstance(self.eos_token_id, int): + self.eos_token_id = [self.eos_token_id] + + @staticmethod + def _should_trust_remote_code(model_type: str) -> bool: + """Determine if trust_remote_code should be True based on model type. + + Some model types require custom tokenizer code (trust_remote_code=True), + while others work with standard tokenizers. + + Args: + model_type: The model type string. + + Returns: + True if trust_remote_code should be enabled, False otherwise. + """ + # Models that typically require custom code + require_trust = { + "fm9g", + "minicpm", + "fm9g7b", + "minicpmv", + "qwen2", + "qwen3", + } + + # Models that work with standard tokenizers (no trust_remote_code needed) + standard_models = { + "llama", + "mistral", + "gemma", + } + + if model_type in require_trust: + return True + elif model_type in standard_models: + return False + else: + # Default to True for unknown model types + return True + + def _apply_model_specific_fixes(self) -> None: + """Apply model-specific tokenizer fixes. + + Currently handles: + - Llama models: Fix decoder to handle space replacement properly. + """ + if self.model_type == "llama": + self._fix_llama_decoder() + + def _fix_llama_decoder(self) -> None: + """Fix Llama tokenizer decoder for proper space handling. + + Llama tokenizers often have a decoder that prepends spaces, causing + double spaces or incorrect spacing in decoded text. This fix replaces + the decoder with one that handles spaces correctly. + """ + backend = getattr(self.tokenizer, "backend_tokenizer", None) + target = getattr(backend, "_tokenizer", backend) if backend else None + + if target is None: + return + + norm = getattr(target, "normalizer", None) + dec = getattr(target, "decoder", None) + + sn = repr(norm)[:800] if norm is not None else "" + sd = repr(dec)[:800] if dec is not None else "" + + has_prepend = "Prepend" in sn + has_strip = "Strip" in sd + + if has_prepend and has_strip: + target.decoder = _dec.Sequence( + [ + _dec.Replace("▁", " "), + _dec.ByteFallback(), + _dec.Fuse(), + ] + ) + + def _configure_pad_token(self) -> None: + """Configure pad token and pad_token_id for the tokenizer. 
+ + If no pad_token is set, this method tries to use eos_token as pad_token. + If that fails, it adds a new [PAD] token. + """ + if self.tokenizer.pad_token is not None: + return + + if self.tokenizer.eos_token is not None: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + else: + # Add a new pad token + self.tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + def encode(self, text: Union[str, List[str]]) -> List[int]: + """Encode text(s) into token IDs. + + Args: + text: A single text string or a list of text strings. + + Returns: + If input is a single string, returns a list of token IDs. + If input is a list of strings, returns a list of lists of token IDs. + + Raises: + TypeError: If text is not a string or list of strings. + """ + if isinstance(text, str): + return self.tokenizer.encode(text) + elif isinstance(text, list): + return [self.tokenizer.encode(t) for t in text] + else: + raise TypeError(f"Expected str or List[str], got {type(text).__name__}") + + def decode( + self, + token_ids: Union[List[int], Any], + skip_special_tokens: bool = True, + **kwargs: Any, + ) -> str: + """Decode token IDs to text. + + Args: + token_ids: List of token IDs to decode, or any iterable of integers. + skip_special_tokens: Whether to skip special tokens in decoding. + **kwargs: Additional keyword arguments passed to tokenizer.decode(). + + Returns: + Decoded text string. + """ + # Convert to list if necessary (e.g., from numpy array or tensor) + if not isinstance(token_ids, list): + try: + token_ids = list(token_ids) + except TypeError: + token_ids = [token_ids] + + return self.tokenizer.decode( + token_ids, skip_special_tokens=skip_special_tokens, **kwargs + ) + + def apply_chat_template( + self, + conversation: List[dict], + add_generation_prompt: bool = True, + tokenize: bool = False, + **kwargs: Any, + ) -> Union[str, List[int]]: + """Apply chat template to a conversation. + + Args: + conversation: List of message dicts with 'role' and 'content' keys. + Example: [{"role": "user", "content": "Hello"}] + add_generation_prompt: Whether to add generation prompt at the end. + tokenize: Whether to tokenize the output. + **kwargs: Additional keyword arguments passed to tokenizer.apply_chat_template(). + + Returns: + Formatted conversation string, or list of token IDs if tokenize=True. + + Raises: + ValueError: If the tokenizer does not have a chat template and tokenize=False. 
+ """ + if ( + hasattr(self.tokenizer, "chat_template") + and self.tokenizer.chat_template is not None + ): + return self.tokenizer.apply_chat_template( + conversation=conversation, + add_generation_prompt=add_generation_prompt, + tokenize=tokenize, + **kwargs, + ) + else: + # Fallback: construct a simple prompt from the conversation + text_parts = [] + for msg in conversation: + role = msg.get("role", "user") + content = msg.get("content", "") + if role == "system": + text_parts.append(f"System: {content}") + elif role == "user": + text_parts.append(f"User: {content}") + elif role == "assistant": + text_parts.append(f"Assistant: {content}") + else: + text_parts.append(f"{role}: {content}") + + text = "\n".join(text_parts) + if add_generation_prompt: + text += "\nAssistant: " + + if tokenize: + return self.encode(text) + return text + + @property + def model_max_length(self) -> int: + """Get the maximum sequence length supported by the model.""" + return getattr(self.tokenizer, "model_max_length", 2048) + + @property + def vocab_size(self) -> int: + """Get the vocabulary size.""" + return self.tokenizer.vocab_size + + @property + def pad_token_id(self) -> Optional[int]: + """Get the pad token ID.""" + return self.tokenizer.pad_token_id + + @property + def eos_token(self) -> Optional[str]: + """Get the EOS token string.""" + return self.tokenizer.eos_token + + def get_tokenizer(self) -> PreTrainedTokenizer: + """Get the underlying HuggingFace tokenizer instance. + + Returns: + The PreTrainedTokenizer or PreTrainedTokenizerFast instance. + """ + return self.tokenizer + + def __repr__(self) -> str: + return ( + f"InfiniLMTokenizer(model_type='{self.model_type}', " + f"vocab_size={self.vocab_size}, " + f"model_max_length={self.model_max_length})" + ) + + +# Legacy function for backward compatibility +def infinilm_encode(tokenizer, text): + """Encode text into token ids using the provided tokenizer. + + Deprecated: Use InfiniLMTokenizer.encode() instead. + This function is kept for backward compatibility. 
+ """ + return tokenizer.encode(text) diff --git a/test/bench/test_benchmark.py b/test/bench/test_benchmark.py index a3adf487..ed8a0982 100644 --- a/test/bench/test_benchmark.py +++ b/test/bench/test_benchmark.py @@ -9,40 +9,44 @@ from datasets import load_dataset, Dataset from abc import ABC, abstractmethod from infinilm.base_config import BaseConfig +from infinilm.tokenizer_utils import InfiniLMTokenizer TOTAL_TOKENS = 0 TOTAL_TIME = 0.0 class BaseBenchmark(ABC): - """Base class for benchmark evaluation with common tokenizer and generation utilities""" + """Base class for benchmark evaluation with common tokenizer and generation utilities.""" + + def __init__(self): + self.tokenizer: InfiniLMTokenizer = None def encode_text(self, text): - """Encode text to token IDs - reused across backends""" + """Encode text to token IDs using InfiniLMTokenizer.""" return self.tokenizer.encode(text) - def decode_token(self, token_id): - """Decode token ID to text - reused across backends""" - return self.tokenizer.decode(token_id) + def decode_token(self, token_ids): + """Decode token IDs to text using InfiniLMTokenizer.""" + return self.tokenizer.decode(token_ids) @abstractmethod def render_input_content(self, *args, **kwargs): - """Render input content - benchmark-specific implementation""" + """Render input content - benchmark-specific implementation.""" pass @abstractmethod def generate(self, *args, **kwargs): - """Generate response - benchmark-specific implementation""" + """Generate response - benchmark-specific implementation.""" pass @abstractmethod def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): - """Backend-specific generation implementation""" + """Backend-specific generation implementation.""" pass class InfiniLMBenchmark(BaseBenchmark): - """Wrapper class for InfiniLM cpp backend for benchmark evaluation""" + """Wrapper class for InfiniLM cpp backend for benchmark evaluation.""" def __init__( self, @@ -55,7 +59,6 @@ def __init__( enable_graph=False, attn_backend="default", ): - import transformers import infinicore from infinilm.modeling_utils import load_model_state_dict_by_file from infinilm.distributed import DistConfig @@ -86,38 +89,15 @@ def __init__( # So device index 0 will automatically map to the first visible device self.device = infinicore.device(device_name, 0) - # Load config and tokenizer + # Initialize tokenizer using InfiniLMTokenizer + self.tokenizer = InfiniLMTokenizer(model_dir_path) + + # Load config for model parameters with open(os.path.join(model_dir_path, "config.json"), "r") as f: self.config_dict = json.load(f) - # Align tokenizer initialization with jiuge backend (010) - # Match the exact same initialization logic based on model type - model_type = self.config_dict.get("model_type", "") - if model_type == "llama": - # For llama models: no trust_remote_code (matches jiuge line 465) - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_dir_path, trust_remote_code=True - ) - elif model_type in ["fm9g", "minicpm", "fm9g7b"]: - # For fm9g/minicpm/fm9g7b models: use trust_remote_code=True (matches jiuge lines 493-495, 518-520) - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_dir_path, trust_remote_code=True - ) - elif model_type in ["qwen2", "qwen3"]: - # For qwen2/qwen3 models: no trust_remote_code (matches jiuge line 534-536) - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_dir_path, trust_remote_code=True - ) - else: - # Default: use trust_remote_code=True for other models - 
self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_dir_path, trust_remote_code=True - ) - - eos_token_id = self.config_dict.get("eos_token_id") - self.eos_token_id = ( - [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id - ) + # Get EOS token IDs from tokenizer manager + self.eos_token_id = self.tokenizer.eos_token_id if backend != "cpp": raise ValueError(f"Unsupported backend: {backend}.") @@ -154,10 +134,11 @@ def __init__( print("Model loaded successfully") def max_context_len(self): + """Get maximum context length from model config.""" return self.config_dict.get("max_position_embeddings", 2048) def render_input_content(self, *args, **kwargs): - """Render input content based on benchmark type""" + """Render input content based on benchmark type.""" if self.benchmark == "ceval": return render_ceval(self.tokenizer, *args, **kwargs) elif self.benchmark == "mmlu": @@ -166,12 +147,12 @@ def render_input_content(self, *args, **kwargs): raise ValueError(f"Unknown benchmark: {self.benchmark}") def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0): - """Generate response based on benchmark type""" + """Generate response based on benchmark type.""" # Render input content input_content = self.render_input_content(*args) print(input_content, end="", flush=True) - # Encode input + # Encode input using InfiniLMTokenizer tokens = self.encode_text(input_content) # Delegate to backend-specific generation implementation @@ -183,7 +164,7 @@ def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0): def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): """ - InfiniLM cpp backend-specific generation implementation + InfiniLM cpp backend-specific generation implementation. NOTE: Validation confirmed input configs are identical between backends. 
The issue was that manual generation loop called InferEngine.generate() which @@ -227,6 +208,7 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): # ---- post process ---- generated_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) + # Use InfiniLMTokenizer for decoding output_text = self.tokenizer.decode(generated_ids) # ---- stats ---- @@ -250,13 +232,13 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): return output_text def destroy_model_instance(self): - # Cleanup if needed + """Cleanup model resources.""" del self.model print("Model destroyed") class TorchBenchmark(BaseBenchmark): - """Torch backend using HuggingFace Transformers""" + """Torch backend using HuggingFace Transformers.""" def __init__(self, model_dir_path, device_type_str="cpu", benchmark="ceval"): import torch @@ -264,7 +246,7 @@ def __init__(self, model_dir_path, device_type_str="cpu", benchmark="ceval"): self.benchmark = benchmark - # Device + # Device setup if device_type_str == "nvidia": self.device = torch.device("cuda") elif device_type_str == "cpu": @@ -307,9 +289,11 @@ def __init__(self, model_dir_path, device_type_str="cpu", benchmark="ceval"): ) def max_context_len(self): + """Get maximum context length from model config.""" return self.config_dict.get("max_position_embeddings", 2048) def render_input_content(self, *args, **kwargs): + """Render input content based on benchmark type.""" if self.benchmark == "ceval": return render_ceval(self.tokenizer, *args, **kwargs) elif self.benchmark == "mmlu": @@ -318,6 +302,7 @@ def render_input_content(self, *args, **kwargs): raise ValueError(f"Unknown benchmark: {self.benchmark}") def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): + """Torch backend-specific generation implementation.""" import torch import time @@ -339,7 +324,7 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): pad_token_id=2, ) - # --- end sync --- + # Sync for accurate timing if self.device.type == "cuda": torch.cuda.synchronize() @@ -370,20 +355,23 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): return output_text def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0): + """Generate response based on benchmark type.""" input_content = self.render_input_content(*args) print(input_content, end="", flush=True) + # Encode input using InfiniLMTokenizer tokens = self.encode_text(input_content) return self._generate_step(tokens, max_steps, topp_, topk_, temperature_) def destroy_model_instance(self): + """Cleanup model resources.""" del self.model print("Torch model destroyed") class VLLMBenchmark(BaseBenchmark): - """vLLM backend using vllm.LLM""" + """vLLM backend using vllm.LLM.""" def __init__( self, @@ -429,9 +417,11 @@ def __init__( print("vLLM model loaded successfully") def max_context_len(self): + """Get maximum context length from model config.""" return self.config_dict.get("max_position_embeddings", 2048) def render_input_content(self, *args, **kwargs): + """Render input content based on benchmark type.""" if self.benchmark == "ceval": return render_ceval(self.tokenizer, *args, **kwargs) elif self.benchmark == "mmlu": @@ -440,15 +430,19 @@ def render_input_content(self, *args, **kwargs): raise ValueError(f"Unknown benchmark: {self.benchmark}") def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0): + """Generate response based on benchmark type.""" input_content = self.render_input_content(*args) 
print(input_content, end="", flush=True) + # Encode input using InfiniLMTokenizer tokens = self.encode_text(input_content) return self._generate_step(tokens, max_steps, topp_, topk_, temperature_) def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): + """vLLM backend-specific generation implementation.""" from vllm import SamplingParams + # Decode prompt from tokens prompt = self.tokenizer.decode(tokens) sampling_params = SamplingParams( @@ -473,6 +467,7 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): # ---- stats ---- input_tokens = len(tokens) + # Use InfiniLMTokenizer for encoding to count tokens new_tokens = len(self.encode_text(output_text)) total_tokens = input_tokens + new_tokens @@ -494,12 +489,13 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): return output_text def destroy_model_instance(self): + """Cleanup model resources.""" del self.llm print("vLLM model destroyed") def render_ceval(_tokenizer, conversation): - """Render C-Eval conversation to input content""" + """Render C-Eval conversation to input content.""" return ( _tokenizer.apply_chat_template( conversation=conversation, @@ -511,7 +507,7 @@ def render_ceval(_tokenizer, conversation): def render_mmlu(_tokenizer, question, choices): - """Render MMLU question and choices to input content""" + """Render MMLU question and choices to input content.""" choices_text = "\n".join( [f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)] ) @@ -542,7 +538,7 @@ def render_mmlu(_tokenizer, question, choices): def extract_answer_ceval(output_content, answer): - """Extract predicted answer from C-Eval output""" + """Extract predicted answer from C-Eval output.""" output_upper = output_content.upper().strip() position = 0 ABCD = output_upper[position : position + 2] @@ -550,7 +546,7 @@ def extract_answer_ceval(output_content, answer): def extract_answer_mmlu(output_content): - """Extract predicted answer from MMLU output (returns 0-3 index or None)""" + """Extract predicted answer from MMLU output (returns 0-3 index or None).""" output_upper = output_content.upper().strip() # Find first meaningful token @@ -565,7 +561,7 @@ def extract_answer_mmlu(output_content): def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=None): - """Evaluate samples for a single subject and return results""" + """Evaluate samples for a single subject and return results.""" answers_list = [] for idx, sample in enumerate(samples): if benchmark == "ceval": @@ -640,7 +636,7 @@ def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=Non true_num = 0 all_num = 0 for cont in answers_list: - id = cont["id"] + idx = cont["id"] all_num = all_num + 1 if benchmark == "ceval": @@ -648,16 +644,16 @@ def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=Non is_correct = cont["is_correct"] if is_correct: true_num = true_num + 1 - print(f"id {id} : ", "正确") + print(f"id {idx} : ", "正确") else: - print(f"id {id}: ", "错误") + print(f"id {idx}: ", "错误") elif benchmark == "mmlu": answer = cont["answer"] predicted = cont["predicted"] if predicted is not None and predicted == answer: true_num = true_num + 1 - print(f"id {id}: Correct") + print(f"id {idx}: Correct") else: answer_letter = chr(65 + answer) if answer < 4 else "?" predicted_letter = ( @@ -666,7 +662,7 @@ def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=Non else "?" 
) print( - f"id {id}: Wrong (correct: {answer_letter}, predicted: {predicted_letter})" + f"id {idx}: Wrong (correct: {answer_letter}, predicted: {predicted_letter})" ) accuracy = true_num / all_num if all_num > 0 else 0.0 @@ -900,7 +896,7 @@ def _load_ceval_subject(subj): return _load_ceval_from_cache( args.cache_dir, subj, args.split, ceval_subjects ) - # online fallback via HF load_dataset + # Online fallback via HF load_dataset if args.split == "all": records = [] for split_name in ["val", "test"]: @@ -1104,7 +1100,7 @@ def load_subject_samples(subj_name): def main(): - """Main function""" + """Main function.""" cfg = BaseConfig() device_type_str = cfg.device
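
A minimal usage sketch (not part of the patch above) of the InfiniLMTokenizer wrapper added in python/infinilm/tokenizer_utils.py; the model directory path below is a placeholder for any local checkpoint containing config.json and the usual tokenizer files.

# Hypothetical example driving the new InfiniLMTokenizer API from this diff.
from infinilm.tokenizer_utils import InfiniLMTokenizer

tok = InfiniLMTokenizer("~/models/my-model")  # placeholder path; "~" is expanded internally

# Render a single-turn conversation; when the underlying tokenizer ships no chat
# template, the wrapper falls back to a plain "User:/Assistant:" style prompt.
text = tok.apply_chat_template(
    conversation=[{"role": "user", "content": "Hello"}],
    add_generation_prompt=True,
    tokenize=False,
)

ids = tok.encode(text)            # str -> list[int]
batch = tok.encode([text, text])  # list[str] -> list[list[int]]

print(tok.decode(ids, skip_special_tokens=True))
print(tok.eos_token_id)   # read from config.json and always normalized to a list
print(tok.pad_token_id)   # configured from eos_token (or a new "[PAD]" token) when missing

The wrapper centralizes behavior that the old call sites in examples/bench.py, examples/test_infer.py, and python/infinilm/llm/llm.py each implemented separately: the llama decoder fix, pad-token configuration, trust_remote_code selection by model type, and chat-template fallback.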