diff --git a/examples/bench.py b/examples/bench.py index 453c64cd..1d00f326 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -1,10 +1,10 @@ import infinicore -from transformers import AutoTokenizer -from infinilm.modeling_utils import load_model_state_dict_by_file -from infinilm.distributed import DistConfig -from infinilm.infer_engine import GenerationConfig, InferEngine from infinilm.base_config import BaseConfig from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig +from infinilm.distributed import DistConfig +from infinilm.infer_engine import GenerationConfig, InferEngine +from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.tokenizer_utils import InfiniLMTokenizer import argparse import sys import time @@ -42,9 +42,9 @@ def get_test_cases( input_len_list: list[int], output_len_list: list[int], ): + """Generate cases ordered by ascending KV cache memory usage.""" model_path = os.path.expanduser(model_path) - """Generate cases ordered by ascending KV cache memory usage.""" # Load model config to derive attention dimensions config = read_json_file(os.path.join(model_path, "config.json")) head_dim = config.get( @@ -92,19 +92,23 @@ def get_test_cases( return case_dict +# Load benchmark prompt from file with open("examples/bench_prompt.md", "r") as f: prompt = f.read() def repeat_prompt(input_ids: list[int], target_length: int): + """Repeat or truncate input_ids to match target_length.""" num = len(input_ids) repeat_times = (target_length + num - 1) // num return (input_ids * repeat_times)[:target_length] class TestModel: + """Benchmark model wrapper for performance testing.""" + model: infinicore.nn.Module - tokenizer: AutoTokenizer + tokenizer: InfiniLMTokenizer input_ids_list: list[int] def __init__( @@ -118,8 +122,9 @@ def __init__( attn_backend="default", ) -> None: model_path = os.path.expanduser(model_path) + # ---------------------------------------------------------------------------- # - # 创建模型, + # Create Model # ---------------------------------------------------------------------------- # model = InferEngine( model_path, @@ -132,47 +137,30 @@ def __init__( ) # ---------------------------------------------------------------------------- # - # 加载权重 + # Load Weights # ---------------------------------------------------------------------------- # if not skip_load: load_model_state_dict_by_file(model, model_path, dtype=model.dtype) # ---------------------------------------------------------------------------- # - # 创建 tokenizer + # Initialize Tokenizer # ---------------------------------------------------------------------------- # - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - if tokenizer.pad_token is None: - if tokenizer.eos_token is not None: - tokenizer.pad_token = tokenizer.eos_token - tokenizer.pad_token_id = tokenizer.eos_token_id - else: - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + self.tokenizer = InfiniLMTokenizer(model_path) # ---------------------------------------------------------------------------- # - # token编码 + # Encode Prompt # ---------------------------------------------------------------------------- # input_content = [ - tokenizer.apply_chat_template( + self.tokenizer.apply_chat_template( conversation=[{"role": "user", "content": prompt}], add_generation_prompt=True, tokenize=False, ) ] - # print(input_content, end="", flush=True) - # Support Transformers >= 5.0 for batch_encode_plus deprecation - encoding = tokenizer( - input_content, - padding=True, - 
truncation=True, - max_length=8192, - ) - - input_ids_list = encoding["input_ids"] + input_ids_list = self.tokenizer.encode(input_content) self.model = model - self.tokenizer = tokenizer self.input_ids_list = input_ids_list def run( @@ -184,11 +172,12 @@ def run( top_p=1.0, temperature=1.0, ): + """Run a single benchmark test case.""" input_ids = repeat_prompt(self.input_ids_list[0], target_length=input_len) input_ids_list = [input_ids] * batch_size # ---------------------------------------------------------------------------- # - # 自回归生成 + # Autoregressive Generation # ---------------------------------------------------------------------------- # input_ids_infini = infinicore.from_list(input_ids_list) @@ -211,6 +200,7 @@ def run( numpy_output_ids = np.array( [output_id.to_numpy()[0] for output_id in output_ids] ) + # Use InfiniLMTokenizer for decoding print(self.tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) print( @@ -224,8 +214,9 @@ def run( device_str = cfg.get_device_str(cfg.device) _PAGED_KV_BLOCK_SIZE = cfg.block_size + # -------------------------------------------------------- # - # 解析参数 + # Parse Arguments # -------------------------------------------------------- # model_path = cfg.model @@ -252,8 +243,9 @@ def run( output_len = [output_len] cases_dict = get_test_cases(model_path, batch_size, input_len, output_len) + # -------------------------------------------------------- # - # 测试 + # Initialize Test Configuration # -------------------------------------------------------- # if enable_paged_attn: paged_kv_block_size = _PAGED_KV_BLOCK_SIZE @@ -290,7 +282,7 @@ def run( if cfg.warmup: warmup_steps = 1 - # warmup cache capacity + # Warmup cache capacity warmup_cache_len = 128 warmup_batch = len(test.input_ids_list) @@ -316,7 +308,7 @@ def run( _ = test.model.generate( input_ids_infini, GenerationConfig( - max_new_tokens=5, # decode kernel warmup + max_new_tokens=5, # Decode kernel warmup temperature=cfg.temperature, top_k=cfg.top_k, top_p=cfg.top_p, @@ -327,12 +319,12 @@ def run( print("=================== warmup done ====================") - # reset cache back to benchmark config + # Reset cache back to benchmark config if cache_config is not None: test.model.reset_cache(cache_config) # ---------------------------------------------------------------------------- # - # Warmup done + # Run Benchmarks # ---------------------------------------------------------------------------- # for idx, case in tqdm(cases_dict.items(), desc="Processing cases"): @@ -343,7 +335,7 @@ def run( output_len = case["output_len"] if not enable_paged_attn: - # reset cache if static kvcache is used + # Reset cache if static KV cache is used initial_capacity = input_len + output_len test.model.reset_cache( StaticKVCacheConfig( @@ -351,7 +343,7 @@ def run( ) ) - # run test one case + # Run test for one case test.run( batch_size=batch_size, input_len=input_len, diff --git a/examples/test_infer.py b/examples/test_infer.py index abec5d00..a3551efc 100644 --- a/examples/test_infer.py +++ b/examples/test_infer.py @@ -1,18 +1,16 @@ import infinicore -import transformers -from transformers import AutoTokenizer -from tokenizers import decoders as _dec -from infinilm.modeling_utils import load_model_state_dict_by_file -from infinilm.distributed import DistConfig -from infinilm.infer_engine import GenerationConfig, InferEngine import argparse import sys import time import os import numpy as np from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig -from packaging import version + from 
infinilm.base_config import BaseConfig +from infinilm.distributed import DistConfig +from infinilm.infer_engine import GenerationConfig, InferEngine +from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.tokenizer_utils import InfiniLMTokenizer from PIL import Image import torch @@ -37,6 +35,7 @@ def test( image_path=None, ): model_path = os.path.expanduser(model_path) + # ---------------------------------------------------------------------------- # # Create Model # ---------------------------------------------------------------------------- # @@ -51,16 +50,18 @@ def test( attention_backend=attn_backend, kv_cache_dtype=cfg.kv_cache_dtype, ) + # ---------------------------------------------------------------------------- # # Load Weights # ---------------------------------------------------------------------------- # load_model_state_dict_by_file(model, model_path, dtype=model.dtype) # ---------------------------------------------------------------------------- # - # create tokenizer + # Initialize Tokenizer # ---------------------------------------------------------------------------- # - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + tokenizer = InfiniLMTokenizer(model_path) + # Initialize processor for multimodal models processor = None if image_path is not None: if model.model_type == "minicpmv": @@ -69,33 +70,14 @@ def test( processor = AutoProcessor.from_pretrained( model_path, trust_remote_code=True ) - tokenizer = processor.tokenizer - - if "llama" == model.model_type: - backend = getattr(tokenizer, "backend_tokenizer", None) - target = getattr(backend, "_tokenizer", backend) - norm = getattr(target, "normalizer", None) - dec = getattr(target, "decoder", None) - sn = repr(norm)[:800] if norm is not None else "" - sd = repr(dec)[:800] if dec is not None else "" - has_prepend = "Prepend" in sn - has_strip = "Strip" in sd - if has_prepend and has_strip: - target.decoder = _dec.Sequence( - [ - _dec.Replace("▁", " "), - _dec.ByteFallback(), - _dec.Fuse(), - ] - ) # ---------------------------------------------------------------------------- # - # tokenize + # Tokenize Inputs # ---------------------------------------------------------------------------- # - # prompt = "山东最高的山是?" 
if isinstance(prompts, str): prompts = [prompts] + # Handle image prompts for multimodal models if image_path is not None: updated_prompts = [] for prompt in prompts: @@ -104,18 +86,17 @@ def test( updated_prompts.append(prompt) prompts = updated_prompts - if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None: - input_contents = [ - tokenizer.apply_chat_template( - conversation=[{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - for prompt in prompts - ] - else: - input_contents = prompts + # Apply chat template or use raw prompts + input_contents = [ + tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + for prompt in prompts + ] + # Process multimodal inputs or encode text pixel_values = None image_bound = None tgt_sizes = None @@ -139,39 +120,14 @@ def test( else: raise ValueError(f"Unsupported multimodal model_type: {model.model_type}") else: - if hasattr(tokenizer, "batch_encode_plus"): - input_ids_list = tokenizer.batch_encode_plus(input_contents)["input_ids"] - elif hasattr(tokenizer, "_encode_plus"): - input_ids_list = tokenizer._encode_plus(input_contents)["input_ids"] - else: - input_ids_list = tokenizer(input_contents)[ - "input_ids" - ] # List: [[1, 1128, 526, 366, 29892]] - - # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ - # "input_ids" - # ] # List: [[1, 1128, 526, 366, 29892]] - if version.parse(transformers.__version__) < version.parse("5.0.0"): - # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. - input_ids_list = [ - tokenizer.encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - else: - input_ids_list = [ - tokenizer._encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] + # Use InfiniLMTokenizer for encoding + input_ids_list = tokenizer.encode(input_contents) # ---------------------------------------------------------------------------- # - # Create KVCache + # Create KV Cache # ---------------------------------------------------------------------------- # if enable_paged_attn: - batch_size = 1 if prompts is str else len(prompts) + batch_size = 1 if isinstance(prompts, str) else len(prompts) max_total_tokens = max_new_tokens + len(input_ids_list[0]) cache_config = PagedKVCacheConfig( num_blocks=( @@ -181,7 +137,7 @@ def test( block_size=_PAGED_KV_BLOCK_SIZE, ) else: - batch_size = 1 if prompts is str else len(prompts) + batch_size = 1 if isinstance(prompts, str) else len(prompts) initial_capacity = max_new_tokens + len(input_ids_list[0]) cache_config = StaticKVCacheConfig( max_batch_size=batch_size, max_cache_len=initial_capacity @@ -223,7 +179,7 @@ def test( ) pixel_values_infini = infinicore.from_torch(pixel_values_tensor) - # 2. tgt_sizes + # 2. Target sizes all_tgt_sizes = [ tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor) ] @@ -232,7 +188,7 @@ def test( tgt_sizes_infini = infinicore.from_torch(tgt_sizes_tensor) - # 3. image_bound + # 3. 
Image bounds batch_size = len(image_bound) max_ranges = max(len(b) for b in image_bound) diff --git a/python/infinilm/__init__.py b/python/infinilm/__init__.py index f552a2cc..51edf681 100644 --- a/python/infinilm/__init__.py +++ b/python/infinilm/__init__.py @@ -3,6 +3,7 @@ from . import cache from . import llm from . import base_config +from . import tokenizer_utils from .llm import ( LLM, @@ -18,6 +19,7 @@ "cache", "llm", "base_config", + "tokenizer_utils", # LLM classes "LLM", "AsyncLLMEngine", diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index b640cf20..816b353f 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -14,9 +14,6 @@ from typing import List, Optional, Union, AsyncIterator from dataclasses import dataclass -from transformers import AutoTokenizer -from tokenizers import decoders as _dec - import infinicore from infinilm.llm.request import ( @@ -33,6 +30,7 @@ from infinilm.infer_engine import InferEngine from infinilm.cache.cache import PagedKVCacheConfig, StaticKVCacheConfig from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.tokenizer_utils import InfiniLMTokenizer logger = logging.getLogger(__name__) @@ -99,11 +97,8 @@ def __init__(self, config: EngineConfig): self.model_engine, config.model_path, dtype=self.model_engine.dtype ) - # Initialize tokenizer - self.tokenizer = AutoTokenizer.from_pretrained( - config.model_path, trust_remote_code=True - ) - self._fix_tokenizer_decoder() + # Initialize tokenizer using InfiniLMTokenizer + self.tokenizer = InfiniLMTokenizer(config.model_path) # Initialize KV cache based on cache type if config.cache_type == "static": @@ -130,14 +125,14 @@ def __init__(self, config: EngineConfig): self.model_engine.reset_cache(cache_config) self.cache_type = config.cache_type - # Get EOS token IDs from model config - self.eos_token_ids = self.model_engine.eos_token_id or [] + # Get EOS token IDs from tokenizer + self.eos_token_ids = self.tokenizer.eos_token_id or [] if isinstance(self.eos_token_ids, int): self.eos_token_ids = [self.eos_token_ids] logger.info( f"LLMEngine initialized with model at {config.model_path} " - f"on device {config.device}" + f"on device {config.device} " f"enable_graph={config.enable_graph}" ) @@ -166,26 +161,6 @@ def _init_device(self): self.dtype = dtype_map[self.config.dtype] - def _fix_tokenizer_decoder(self): - """Fix tokenizer decoder for llama models.""" - if "llama" in self.model_engine.model_type.lower(): - backend = getattr(self.tokenizer, "backend_tokenizer", None) - target = getattr(backend, "_tokenizer", backend) - norm = getattr(target, "normalizer", None) - dec = getattr(target, "decoder", None) - sn = repr(norm)[:800] if norm is not None else "" - sd = repr(dec)[:800] if dec is not None else "" - has_prepend = "Prepend" in sn - has_strip = "Strip" in sd - if has_prepend and has_strip: - target.decoder = _dec.Sequence( - [ - _dec.Replace("▁", " "), - _dec.ByteFallback(), - _dec.Fuse(), - ] - ) - def add_request(self, request: InferenceRequest): """Add a request to the scheduler.""" self.scheduler.add_request(request) @@ -341,7 +316,8 @@ def _check_request_finished(self, req: InferenceRequest, token_id: int) -> bool: req.finish_reason = FinishReason.EOS_TOKEN return True - # While ignoring EOS, stop strings are also ignored to avoid requiring additional arguments for benchmarking. + # While ignoring EOS, stop strings are also ignored to avoid requiring + # additional arguments for benchmarking. 
# Check stop strings # Remove stop string from generated_text if STOP_STRING is the finishing reason stop_strings = req.sampling_params.stop or [] @@ -354,11 +330,11 @@ def _check_request_finished(self, req: InferenceRequest, token_id: int) -> bool: return False def tokenize(self, text: str) -> List[int]: - """Tokenize text to token IDs.""" + """Tokenize text to token IDs using InfiniLMTokenizer.""" return self.tokenizer.encode(text) def detokenize(self, token_ids: List[int]) -> str: - """Detokenize token IDs to text.""" + """Detokenize token IDs to text using InfiniLMTokenizer.""" return self.tokenizer.decode(token_ids) def apply_chat_template( @@ -367,7 +343,7 @@ def apply_chat_template( add_generation_prompt: bool = True, chat_template_kwargs: Optional[dict] = None, ) -> str: - """Apply chat template to messages.""" + """Apply chat template to messages using InfiniLMTokenizer.""" chat_template_kwargs = chat_template_kwargs or {} return self.tokenizer.apply_chat_template( conversation=messages, diff --git a/python/infinilm/tokenizer_utils.py b/python/infinilm/tokenizer_utils.py new file mode 100644 index 00000000..ad36c8b9 --- /dev/null +++ b/python/infinilm/tokenizer_utils.py @@ -0,0 +1,336 @@ +""" +Tokenizer utilities for InfiniLM. + +This module provides InfiniLMTokenizer class that encapsulates all tokenizer +operations including initialization, encoding/decoding, chat template handling, +and model-specific fixes. +""" + +import os +import json +from typing import List, Optional, Union, Any + +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from tokenizers import decoders as _dec + + +class InfiniLMTokenizer: + """Unified tokenizer wrapper for InfiniLM. + + This class encapsulates all tokenizer-related operations including: + - Model-specific initialization and fixes + - Encoding/decoding + - Chat template application + - Pad token configuration + + Attributes: + tokenizer: The underlying HuggingFace tokenizer instance. + model_type: The model type string (e.g., 'llama', 'qwen2', 'minicpm'). + eos_token_id: End-of-sequence token ID(s). + """ + + def __init__( + self, + model_path: str, + trust_remote_code: Optional[bool] = None, + **kwargs: Any, + ): + """Initialize the tokenizer for a given model. + + Args: + model_path: Path to the model directory containing config.json and tokenizer files. + trust_remote_code: Whether to trust remote code. + If None, will be determined based on model type. + **kwargs: Additional keyword arguments passed to AutoTokenizer.from_pretrained(). + + Raises: + FileNotFoundError: If config.json is not found in model_path. + ValueError: If tokenizer initialization fails. 
+ """ + self._model_path = os.path.expanduser(model_path) + + # Load model config to determine model type + config_path = os.path.join(self._model_path, "config.json") + if not os.path.isfile(config_path): + raise FileNotFoundError(f"config.json not found in {self._model_path}") + + with open(config_path, "r", encoding="utf-8") as f: + config_dict = json.load(f) + + self.model_type = config_dict.get("model_type", "").lower() + + # Determine trust_remote_code based on model type if not explicitly specified + if trust_remote_code is None: + trust_remote_code = self._should_trust_remote_code(self.model_type) + + # Initialize the underlying tokenizer + try: + self.tokenizer = AutoTokenizer.from_pretrained( + self._model_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) + except Exception as e: + raise ValueError( + f"Failed to initialize tokenizer from {self._model_path}: {e}" + ) + + # Apply model-specific fixes + self._apply_model_specific_fixes() + + # Configure pad token + self._configure_pad_token() + + # Extract EOS token ID from config + eos_token_id = config_dict.get("eos_token_id") + if eos_token_id is not None: + self.eos_token_id = ( + [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id + ) + else: + self.eos_token_id = [] + + # Ensure EOS token ID is always a list + if isinstance(self.eos_token_id, int): + self.eos_token_id = [self.eos_token_id] + + @staticmethod + def _should_trust_remote_code(model_type: str) -> bool: + """Determine if trust_remote_code should be True based on model type. + + Some model types require custom tokenizer code (trust_remote_code=True), + while others work with standard tokenizers. + + Args: + model_type: The model type string. + + Returns: + True if trust_remote_code should be enabled, False otherwise. + """ + # Models that typically require custom code + require_trust = { + "fm9g", + "minicpm", + "fm9g7b", + "minicpmv", + "qwen2", + "qwen3", + } + + # Models that work with standard tokenizers (no trust_remote_code needed) + standard_models = { + "llama", + "mistral", + "gemma", + } + + if model_type in require_trust: + return True + elif model_type in standard_models: + return False + else: + # Default to True for unknown model types + return True + + def _apply_model_specific_fixes(self) -> None: + """Apply model-specific tokenizer fixes. + + Currently handles: + - Llama models: Fix decoder to handle space replacement properly. + """ + if self.model_type == "llama": + self._fix_llama_decoder() + + def _fix_llama_decoder(self) -> None: + """Fix Llama tokenizer decoder for proper space handling. + + Llama tokenizers often have a decoder that prepends spaces, causing + double spaces or incorrect spacing in decoded text. This fix replaces + the decoder with one that handles spaces correctly. + """ + backend = getattr(self.tokenizer, "backend_tokenizer", None) + target = getattr(backend, "_tokenizer", backend) if backend else None + + if target is None: + return + + norm = getattr(target, "normalizer", None) + dec = getattr(target, "decoder", None) + + sn = repr(norm)[:800] if norm is not None else "" + sd = repr(dec)[:800] if dec is not None else "" + + has_prepend = "Prepend" in sn + has_strip = "Strip" in sd + + if has_prepend and has_strip: + target.decoder = _dec.Sequence( + [ + _dec.Replace("▁", " "), + _dec.ByteFallback(), + _dec.Fuse(), + ] + ) + + def _configure_pad_token(self) -> None: + """Configure pad token and pad_token_id for the tokenizer. 
+ + If no pad_token is set, this method tries to use eos_token as pad_token. + If that fails, it adds a new [PAD] token. + """ + if self.tokenizer.pad_token is not None: + return + + if self.tokenizer.eos_token is not None: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + else: + # Add a new pad token + self.tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + def encode(self, text: Union[str, List[str]]) -> List[int]: + """Encode text(s) into token IDs. + + Args: + text: A single text string or a list of text strings. + + Returns: + If input is a single string, returns a list of token IDs. + If input is a list of strings, returns a list of lists of token IDs. + + Raises: + TypeError: If text is not a string or list of strings. + """ + if isinstance(text, str): + return self.tokenizer.encode(text) + elif isinstance(text, list): + return [self.tokenizer.encode(t) for t in text] + else: + raise TypeError(f"Expected str or List[str], got {type(text).__name__}") + + def decode( + self, + token_ids: Union[List[int], Any], + skip_special_tokens: bool = True, + **kwargs: Any, + ) -> str: + """Decode token IDs to text. + + Args: + token_ids: List of token IDs to decode, or any iterable of integers. + skip_special_tokens: Whether to skip special tokens in decoding. + **kwargs: Additional keyword arguments passed to tokenizer.decode(). + + Returns: + Decoded text string. + """ + # Convert to list if necessary (e.g., from numpy array or tensor) + if not isinstance(token_ids, list): + try: + token_ids = list(token_ids) + except TypeError: + token_ids = [token_ids] + + return self.tokenizer.decode( + token_ids, skip_special_tokens=skip_special_tokens, **kwargs + ) + + def apply_chat_template( + self, + conversation: List[dict], + add_generation_prompt: bool = True, + tokenize: bool = False, + **kwargs: Any, + ) -> Union[str, List[int]]: + """Apply chat template to a conversation. + + Args: + conversation: List of message dicts with 'role' and 'content' keys. + Example: [{"role": "user", "content": "Hello"}] + add_generation_prompt: Whether to add generation prompt at the end. + tokenize: Whether to tokenize the output. + **kwargs: Additional keyword arguments passed to tokenizer.apply_chat_template(). + + Returns: + Formatted conversation string, or list of token IDs if tokenize=True. + + Raises: + ValueError: If the tokenizer does not have a chat template and tokenize=False. 
+ """ + if ( + hasattr(self.tokenizer, "chat_template") + and self.tokenizer.chat_template is not None + ): + return self.tokenizer.apply_chat_template( + conversation=conversation, + add_generation_prompt=add_generation_prompt, + tokenize=tokenize, + **kwargs, + ) + else: + # Fallback: construct a simple prompt from the conversation + text_parts = [] + for msg in conversation: + role = msg.get("role", "user") + content = msg.get("content", "") + if role == "system": + text_parts.append(f"System: {content}") + elif role == "user": + text_parts.append(f"User: {content}") + elif role == "assistant": + text_parts.append(f"Assistant: {content}") + else: + text_parts.append(f"{role}: {content}") + + text = "\n".join(text_parts) + if add_generation_prompt: + text += "\nAssistant: " + + if tokenize: + return self.encode(text) + return text + + @property + def model_max_length(self) -> int: + """Get the maximum sequence length supported by the model.""" + return getattr(self.tokenizer, "model_max_length", 2048) + + @property + def vocab_size(self) -> int: + """Get the vocabulary size.""" + return self.tokenizer.vocab_size + + @property + def pad_token_id(self) -> Optional[int]: + """Get the pad token ID.""" + return self.tokenizer.pad_token_id + + @property + def eos_token(self) -> Optional[str]: + """Get the EOS token string.""" + return self.tokenizer.eos_token + + def get_tokenizer(self) -> PreTrainedTokenizer: + """Get the underlying HuggingFace tokenizer instance. + + Returns: + The PreTrainedTokenizer or PreTrainedTokenizerFast instance. + """ + return self.tokenizer + + def __repr__(self) -> str: + return ( + f"InfiniLMTokenizer(model_type='{self.model_type}', " + f"vocab_size={self.vocab_size}, " + f"model_max_length={self.model_max_length})" + ) + + +# Legacy function for backward compatibility +def infinilm_encode(tokenizer, text): + """Encode text into token ids using the provided tokenizer. + + Deprecated: Use InfiniLMTokenizer.encode() instead. + This function is kept for backward compatibility. 
+ """ + return tokenizer.encode(text) diff --git a/test/bench/test_benchmark.py b/test/bench/test_benchmark.py index a3adf487..ed8a0982 100644 --- a/test/bench/test_benchmark.py +++ b/test/bench/test_benchmark.py @@ -9,40 +9,44 @@ from datasets import load_dataset, Dataset from abc import ABC, abstractmethod from infinilm.base_config import BaseConfig +from infinilm.tokenizer_utils import InfiniLMTokenizer TOTAL_TOKENS = 0 TOTAL_TIME = 0.0 class BaseBenchmark(ABC): - """Base class for benchmark evaluation with common tokenizer and generation utilities""" + """Base class for benchmark evaluation with common tokenizer and generation utilities.""" + + def __init__(self): + self.tokenizer: InfiniLMTokenizer = None def encode_text(self, text): - """Encode text to token IDs - reused across backends""" + """Encode text to token IDs using InfiniLMTokenizer.""" return self.tokenizer.encode(text) - def decode_token(self, token_id): - """Decode token ID to text - reused across backends""" - return self.tokenizer.decode(token_id) + def decode_token(self, token_ids): + """Decode token IDs to text using InfiniLMTokenizer.""" + return self.tokenizer.decode(token_ids) @abstractmethod def render_input_content(self, *args, **kwargs): - """Render input content - benchmark-specific implementation""" + """Render input content - benchmark-specific implementation.""" pass @abstractmethod def generate(self, *args, **kwargs): - """Generate response - benchmark-specific implementation""" + """Generate response - benchmark-specific implementation.""" pass @abstractmethod def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): - """Backend-specific generation implementation""" + """Backend-specific generation implementation.""" pass class InfiniLMBenchmark(BaseBenchmark): - """Wrapper class for InfiniLM cpp backend for benchmark evaluation""" + """Wrapper class for InfiniLM cpp backend for benchmark evaluation.""" def __init__( self, @@ -55,7 +59,6 @@ def __init__( enable_graph=False, attn_backend="default", ): - import transformers import infinicore from infinilm.modeling_utils import load_model_state_dict_by_file from infinilm.distributed import DistConfig @@ -86,38 +89,15 @@ def __init__( # So device index 0 will automatically map to the first visible device self.device = infinicore.device(device_name, 0) - # Load config and tokenizer + # Initialize tokenizer using InfiniLMTokenizer + self.tokenizer = InfiniLMTokenizer(model_dir_path) + + # Load config for model parameters with open(os.path.join(model_dir_path, "config.json"), "r") as f: self.config_dict = json.load(f) - # Align tokenizer initialization with jiuge backend (010) - # Match the exact same initialization logic based on model type - model_type = self.config_dict.get("model_type", "") - if model_type == "llama": - # For llama models: no trust_remote_code (matches jiuge line 465) - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_dir_path, trust_remote_code=True - ) - elif model_type in ["fm9g", "minicpm", "fm9g7b"]: - # For fm9g/minicpm/fm9g7b models: use trust_remote_code=True (matches jiuge lines 493-495, 518-520) - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_dir_path, trust_remote_code=True - ) - elif model_type in ["qwen2", "qwen3"]: - # For qwen2/qwen3 models: no trust_remote_code (matches jiuge line 534-536) - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_dir_path, trust_remote_code=True - ) - else: - # Default: use trust_remote_code=True for other models - 
self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_dir_path, trust_remote_code=True - ) - - eos_token_id = self.config_dict.get("eos_token_id") - self.eos_token_id = ( - [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id - ) + # Get EOS token IDs from tokenizer manager + self.eos_token_id = self.tokenizer.eos_token_id if backend != "cpp": raise ValueError(f"Unsupported backend: {backend}.") @@ -154,10 +134,11 @@ def __init__( print("Model loaded successfully") def max_context_len(self): + """Get maximum context length from model config.""" return self.config_dict.get("max_position_embeddings", 2048) def render_input_content(self, *args, **kwargs): - """Render input content based on benchmark type""" + """Render input content based on benchmark type.""" if self.benchmark == "ceval": return render_ceval(self.tokenizer, *args, **kwargs) elif self.benchmark == "mmlu": @@ -166,12 +147,12 @@ def render_input_content(self, *args, **kwargs): raise ValueError(f"Unknown benchmark: {self.benchmark}") def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0): - """Generate response based on benchmark type""" + """Generate response based on benchmark type.""" # Render input content input_content = self.render_input_content(*args) print(input_content, end="", flush=True) - # Encode input + # Encode input using InfiniLMTokenizer tokens = self.encode_text(input_content) # Delegate to backend-specific generation implementation @@ -183,7 +164,7 @@ def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0): def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): """ - InfiniLM cpp backend-specific generation implementation + InfiniLM cpp backend-specific generation implementation. NOTE: Validation confirmed input configs are identical between backends. 
The issue was that manual generation loop called InferEngine.generate() which @@ -227,6 +208,7 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): # ---- post process ---- generated_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) + # Use InfiniLMTokenizer for decoding output_text = self.tokenizer.decode(generated_ids) # ---- stats ---- @@ -250,13 +232,13 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): return output_text def destroy_model_instance(self): - # Cleanup if needed + """Cleanup model resources.""" del self.model print("Model destroyed") class TorchBenchmark(BaseBenchmark): - """Torch backend using HuggingFace Transformers""" + """Torch backend using HuggingFace Transformers.""" def __init__(self, model_dir_path, device_type_str="cpu", benchmark="ceval"): import torch @@ -264,7 +246,7 @@ def __init__(self, model_dir_path, device_type_str="cpu", benchmark="ceval"): self.benchmark = benchmark - # Device + # Device setup if device_type_str == "nvidia": self.device = torch.device("cuda") elif device_type_str == "cpu": @@ -307,9 +289,11 @@ def __init__(self, model_dir_path, device_type_str="cpu", benchmark="ceval"): ) def max_context_len(self): + """Get maximum context length from model config.""" return self.config_dict.get("max_position_embeddings", 2048) def render_input_content(self, *args, **kwargs): + """Render input content based on benchmark type.""" if self.benchmark == "ceval": return render_ceval(self.tokenizer, *args, **kwargs) elif self.benchmark == "mmlu": @@ -318,6 +302,7 @@ def render_input_content(self, *args, **kwargs): raise ValueError(f"Unknown benchmark: {self.benchmark}") def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): + """Torch backend-specific generation implementation.""" import torch import time @@ -339,7 +324,7 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): pad_token_id=2, ) - # --- end sync --- + # Sync for accurate timing if self.device.type == "cuda": torch.cuda.synchronize() @@ -370,20 +355,23 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): return output_text def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0): + """Generate response based on benchmark type.""" input_content = self.render_input_content(*args) print(input_content, end="", flush=True) + # Encode input using InfiniLMTokenizer tokens = self.encode_text(input_content) return self._generate_step(tokens, max_steps, topp_, topk_, temperature_) def destroy_model_instance(self): + """Cleanup model resources.""" del self.model print("Torch model destroyed") class VLLMBenchmark(BaseBenchmark): - """vLLM backend using vllm.LLM""" + """vLLM backend using vllm.LLM.""" def __init__( self, @@ -429,9 +417,11 @@ def __init__( print("vLLM model loaded successfully") def max_context_len(self): + """Get maximum context length from model config.""" return self.config_dict.get("max_position_embeddings", 2048) def render_input_content(self, *args, **kwargs): + """Render input content based on benchmark type.""" if self.benchmark == "ceval": return render_ceval(self.tokenizer, *args, **kwargs) elif self.benchmark == "mmlu": @@ -440,15 +430,19 @@ def render_input_content(self, *args, **kwargs): raise ValueError(f"Unknown benchmark: {self.benchmark}") def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0): + """Generate response based on benchmark type.""" input_content = self.render_input_content(*args) 
print(input_content, end="", flush=True) + # Encode input using InfiniLMTokenizer tokens = self.encode_text(input_content) return self._generate_step(tokens, max_steps, topp_, topk_, temperature_) def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): + """vLLM backend-specific generation implementation.""" from vllm import SamplingParams + # Decode prompt from tokens prompt = self.tokenizer.decode(tokens) sampling_params = SamplingParams( @@ -473,6 +467,7 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): # ---- stats ---- input_tokens = len(tokens) + # Use InfiniLMTokenizer for encoding to count tokens new_tokens = len(self.encode_text(output_text)) total_tokens = input_tokens + new_tokens @@ -494,12 +489,13 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_): return output_text def destroy_model_instance(self): + """Cleanup model resources.""" del self.llm print("vLLM model destroyed") def render_ceval(_tokenizer, conversation): - """Render C-Eval conversation to input content""" + """Render C-Eval conversation to input content.""" return ( _tokenizer.apply_chat_template( conversation=conversation, @@ -511,7 +507,7 @@ def render_ceval(_tokenizer, conversation): def render_mmlu(_tokenizer, question, choices): - """Render MMLU question and choices to input content""" + """Render MMLU question and choices to input content.""" choices_text = "\n".join( [f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)] ) @@ -542,7 +538,7 @@ def render_mmlu(_tokenizer, question, choices): def extract_answer_ceval(output_content, answer): - """Extract predicted answer from C-Eval output""" + """Extract predicted answer from C-Eval output.""" output_upper = output_content.upper().strip() position = 0 ABCD = output_upper[position : position + 2] @@ -550,7 +546,7 @@ def extract_answer_ceval(output_content, answer): def extract_answer_mmlu(output_content): - """Extract predicted answer from MMLU output (returns 0-3 index or None)""" + """Extract predicted answer from MMLU output (returns 0-3 index or None).""" output_upper = output_content.upper().strip() # Find first meaningful token @@ -565,7 +561,7 @@ def extract_answer_mmlu(output_content): def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=None): - """Evaluate samples for a single subject and return results""" + """Evaluate samples for a single subject and return results.""" answers_list = [] for idx, sample in enumerate(samples): if benchmark == "ceval": @@ -640,7 +636,7 @@ def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=Non true_num = 0 all_num = 0 for cont in answers_list: - id = cont["id"] + idx = cont["id"] all_num = all_num + 1 if benchmark == "ceval": @@ -648,16 +644,16 @@ def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=Non is_correct = cont["is_correct"] if is_correct: true_num = true_num + 1 - print(f"id {id} : ", "正确") + print(f"id {idx} : ", "正确") else: - print(f"id {id}: ", "错误") + print(f"id {idx}: ", "错误") elif benchmark == "mmlu": answer = cont["answer"] predicted = cont["predicted"] if predicted is not None and predicted == answer: true_num = true_num + 1 - print(f"id {id}: Correct") + print(f"id {idx}: Correct") else: answer_letter = chr(65 + answer) if answer < 4 else "?" predicted_letter = ( @@ -666,7 +662,7 @@ def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=Non else "?" 
) print( - f"id {id}: Wrong (correct: {answer_letter}, predicted: {predicted_letter})" + f"id {idx}: Wrong (correct: {answer_letter}, predicted: {predicted_letter})" ) accuracy = true_num / all_num if all_num > 0 else 0.0 @@ -900,7 +896,7 @@ def _load_ceval_subject(subj): return _load_ceval_from_cache( args.cache_dir, subj, args.split, ceval_subjects ) - # online fallback via HF load_dataset + # Online fallback via HF load_dataset if args.split == "all": records = [] for split_name in ["val", "test"]: @@ -1104,7 +1100,7 @@ def load_subject_samples(subj_name): def main(): - """Main function""" + """Main function.""" cfg = BaseConfig() device_type_str = cfg.device
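
A minimal usage sketch (not part of the patch above) of the InfiniLMTokenizer wrapper added in python/infinilm/tokenizer_utils.py; the model directory path below is a placeholder for any local checkpoint containing config.json and the usual tokenizer files.

# Hypothetical example driving the new InfiniLMTokenizer API from this diff.
from infinilm.tokenizer_utils import InfiniLMTokenizer

tok = InfiniLMTokenizer("~/models/my-model")  # placeholder path; "~" is expanded internally

# Render a single-turn conversation; when the underlying tokenizer ships no chat
# template, the wrapper falls back to a plain "User:/Assistant:" style prompt.
text = tok.apply_chat_template(
    conversation=[{"role": "user", "content": "Hello"}],
    add_generation_prompt=True,
    tokenize=False,
)

ids = tok.encode(text)            # str -> list[int]
batch = tok.encode([text, text])  # list[str] -> list[list[int]]

print(tok.decode(ids, skip_special_tokens=True))
print(tok.eos_token_id)   # read from config.json and always normalized to a list
print(tok.pad_token_id)   # configured from eos_token (or a new "[PAD]" token) when missing

The wrapper centralizes behavior that the old call sites in examples/bench.py, examples/test_infer.py, and python/infinilm/llm/llm.py each implemented separately: the llama decoder fix, pad-token configuration, trust_remote_code selection by model type, and chat-template fallback.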