Skip to content

Commit 92b10ac

Browse files
author
LittleMouse
committed
[update] add internvl3-1B-ax630c model; update main_vlm
1 parent 9d816fe commit 92b10ac

File tree

3 files changed

+269
-60
lines changed

3 files changed

+269
-60
lines changed
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
from transformers import AutoTokenizer
from http.server import HTTPServer, BaseHTTPRequestHandler
import json
import argparse
import uuid

# Per-client tokenizer sessions, keyed by the uid handed out by GET /get_uid.
tokenizers = {}
8+
9+
class Tokenizer_Http():
    """Stateful chat-tokenizer wrapper for one client session.

    Keeps the running message history plus the token ids of the currently
    rendered prompt, so each ``encode*`` call can return both the full id
    list and only the newly appended suffix (``diff``) — the client uses
    the diff for incremental KV-cache prefill.
    """

    # Number of trailing tokens stripped when re-rendering history without a
    # pending user turn. NOTE(review): template-specific magic number from the
    # original code — assumed to be the generation-prompt tail of the InternVL
    # chat template; TODO confirm against the actual template.
    _GEN_PROMPT_TAIL = 3

    def __init__(self, model_id):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.messages = [
            {"role": "system", "content": "你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。"},
        ]
        self.token_ids = []        # ids of the currently rendered prompt
        self.token_ids_cache = []  # buffer for streaming decode of partial UTF-8

    def _render_ids(self):
        """Apply the chat template to the current history and encode it."""
        text = self.tokenizer.apply_chat_template(
            self.messages,
            tokenize=False,
            add_generation_prompt=True
        )
        return self.tokenizer.encode(text)

    def _append_user_and_diff(self, content):
        """Append a user turn, re-encode, and return (all ids, new suffix)."""
        self.messages.append({"role": "user", "content": content})
        token_ids = self._render_ids()
        diff = token_ids[len(self.token_ids):]
        self.token_ids = token_ids
        return token_ids, diff

    def encode(self, prompt, last_reply=None):
        """Encode a plain-text user turn.

        If ``last_reply`` is given, the previous assistant reply is folded
        into history first and the baseline ids are resynchronized (minus
        the generation-prompt tail) so ``diff`` only covers the new turn.
        Returns ``(token_ids, diff)``.
        """
        if last_reply is not None:
            self.messages.append({"role": "assistant", "content": last_reply})
            self.token_ids = self._render_ids()[:-self._GEN_PROMPT_TAIL]
        return self._append_user_and_diff(prompt)

    def encode_with_image(self, question: str, num_of_images: int, imgsz: int, last_reply=None):
        """Encode a user turn that references ``num_of_images`` images.

        Each image contributes an ``<img>...</img>`` placeholder block whose
        ``<IMG_CONTEXT>`` repeat count depends on the input resolution.
        Returns ``(token_ids, diff)``, or ``(None, None)`` for an
        unsupported ``imgsz``.
        """
        if last_reply is not None:
            self.messages.append({"role": "assistant", "content": last_reply})

        # Context length per image is fixed by the input resolution.
        if imgsz == 448:
            context_len = 256
        elif imgsz == 224:
            context_len = 64
        else:
            print(f"Unsupported imgsz: {imgsz}")
            return None, None

        # Append one image placeholder block per image to the user text.
        question_with_images = question
        if num_of_images > 0:
            for _ in range(num_of_images):
                question_with_images += "\n<img>" + "<IMG_CONTEXT>" * context_len + "</img>\n"

        return self._append_user_and_diff(question_with_images)

    def decode(self, token_ids):
        """Incrementally decode streamed token ids.

        Ids are buffered until they decode without a replacement character
        (U+FFFD), i.e. until a multi-byte sequence is complete; until then
        an empty string is returned.
        """
        self.token_ids_cache += token_ids
        text = self.tokenizer.decode(self.token_ids_cache)
        if "\ufffd" in text:
            print("Text 中包含非法字符")
            return ""
        else:
            self.token_ids_cache.clear()
            return text

    @property
    def bos_id(self):
        # May be None for tokenizers without a BOS token.
        return self.tokenizer.bos_token_id

    @property
    def eos_id(self):
        return self.tokenizer.eos_token_id

    @property
    def bos_token(self):
        return self.tokenizer.bos_token

    @property
    def eos_token(self):
        return self.tokenizer.eos_token

    @property
    def img_start_token(self):
        # NOTE(review): assumes "<img>" encodes to exactly one id with no
        # prepended special token — TODO confirm for this tokenizer.
        return self.tokenizer.encode("<img>")[0]

    @property
    def img_context_token(self):
        return self.tokenizer.encode("<IMG_CONTEXT>")[0]

    def reset(self, system_prompt=None):
        """Reset history to a single system message and return its token ids.

        Falls back to the CLI-provided ``args.content`` (module global) when
        no prompt is supplied.
        """
        if system_prompt is None:
            system_prompt = args.content
        self.messages = [
            {"role": "system", "content": system_prompt},
        ]
        token_ids = self._render_ids()[:-self._GEN_PROMPT_TAIL]
        self.token_ids = token_ids
        print(self.decode(token_ids))
        return token_ids
114+
115+
116+
class Request(BaseHTTPRequestHandler):
    """HTTP front-end exposing per-uid tokenizer sessions.

    GET:  /get_uid, /bos_id, /eos_id, /img_start_token, /img_context_token
          (the last four take a ``?uid=...`` query parameter).
    POST: /encode, /decode, /reset (JSON bodies carrying ``uid``).
    """
    timeout = 5
    server_version = 'Apache'

    def _lookup(self):
        """Return the Tokenizer_Http for the 'uid' query parameter, or None."""
        return tokenizers.get(self.get_query_param("uid"))

    def do_GET(self):
        """Serve session creation and tokenizer-metadata queries."""
        print(self.path)
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        if '/get_uid' in self.path:
            new_uid = str(uuid.uuid4())
            print("新 uid:", new_uid)
            tokenizers[new_uid] = Tokenizer_Http(args.model_id)
            msg = json.dumps({'uid': new_uid})
        elif '/bos_id' in self.path:
            instance = self._lookup()
            if instance is None:
                msg = json.dumps({'error': 'Invalid uid'})
            else:
                # -1 signals "tokenizer has no BOS token" to the client.
                msg = json.dumps({'bos_id': instance.bos_id if instance.bos_id is not None else -1})
        elif '/eos_id' in self.path:
            instance = self._lookup()
            if instance is None:
                msg = json.dumps({'error': 'Invalid uid'})
            else:
                msg = json.dumps({'eos_id': instance.eos_id if instance.eos_id is not None else -1})
        elif '/img_start_token' in self.path:
            instance = self._lookup()
            if instance is None:
                msg = json.dumps({'error': 'Invalid uid'})
            else:
                msg = json.dumps({'img_start_token': instance.img_start_token})
        elif '/img_context_token' in self.path:
            instance = self._lookup()
            if instance is None:
                msg = json.dumps({'error': 'Invalid uid'})
            else:
                msg = json.dumps({'img_context_token': instance.img_context_token})
        else:
            msg = json.dumps({'error': 'Invalid GET endpoint'})
        print(msg)
        self.wfile.write(msg.encode())

    def do_POST(self):
        """Serve encode/decode/reset operations on an existing session."""
        content_length = int(self.headers.get('content-length', 0))
        data = self.rfile.read(content_length).decode()
        req = json.loads(data)
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()

        if '/encode' in self.path:
            uid = req.get('uid')
            prompt = req.get('text')
            last_reply = req.get('last_reply')
            b_img_prompt = bool(req.get('img_prompt', False))
            instance = tokenizers.get(uid)
            if instance is None:
                msg = json.dumps({'error': 'Invalid uid'})
            elif b_img_prompt and ('num_img' not in req or 'imgsz' not in req):
                # BUGFIX: a missing key previously raised KeyError after the
                # headers were already sent, truncating the response.
                msg = json.dumps({'error': 'img_prompt requires num_img and imgsz'})
            else:
                if b_img_prompt:
                    token_ids, diff = instance.encode_with_image(
                        prompt, req['num_img'], req['imgsz'], last_reply)
                else:
                    token_ids, diff = instance.encode(prompt, last_reply)
                if token_ids is None:
                    # encode_with_image returns (None, None) for an
                    # unsupported imgsz; report it instead of emitting nulls.
                    msg = json.dumps({'error': 'Unsupported imgsz'})
                else:
                    msg = json.dumps({'token_ids': token_ids, 'diff': diff})

        elif '/decode' in self.path:
            uid = req.get('uid')
            token_ids = req.get('token_ids')
            instance = tokenizers.get(uid)
            if instance is None:
                msg = json.dumps({'error': 'Invalid uid'})
            else:
                text = instance.decode(token_ids)
                msg = json.dumps({'text': text})

        elif '/reset' in self.path:
            uid = req.get("uid")
            system_prompt = req.get("system_prompt")
            instance = tokenizers.get(uid)
            if instance is None:
                msg = json.dumps({'error': 'Invalid uid'})
            else:
                if system_prompt is not None:
                    print("system_prompt:", system_prompt)
                    token_ids = instance.reset(system_prompt)
                else:
                    token_ids = instance.reset()
                msg = json.dumps({'token_ids': token_ids})

        else:
            msg = json.dumps({'error': 'Invalid POST endpoint'})

        self.wfile.write(msg.encode())

    def get_query_param(self, key):
        """Extract a single query-string value from self.path (None if absent)."""
        from urllib.parse import urlparse, parse_qs
        query = urlparse(self.path).query
        values = parse_qs(query).get(key)
        return values[0] if values else None
228+
229+
230+
if __name__ == "__main__":
    # CLI configuration for the tokenizer HTTP service.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8080)
    parser.add_argument("--model_id", type=str, default="internvl3_tokenizer")
    parser.add_argument("--content", type=str, default="你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。")
    # Module-level binding: Tokenizer_Http.reset and Request.do_GET read it.
    args = parser.parse_args()

    host = (args.host, args.port)
    print("http://%s:%s" % host)
    # Blocks forever serving tokenizer requests.
    HTTPServer(host, Request).serve_forever()

projects/llm_framework/main_vlm/src/main.cpp

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ class llm_task {
135135
CONFIG_AUTO_SET(file_body["mode_param"], filename_tokens_embed);
136136
CONFIG_AUTO_SET(file_body["mode_param"], filename_post_axmodel);
137137
CONFIG_AUTO_SET(file_body["mode_param"], filename_vpm_resampler_axmodedl);
138+
CONFIG_AUTO_SET(file_body["mode_param"], filename_image_encoder_axmodedl);
138139
CONFIG_AUTO_SET(file_body["mode_param"], template_filename_axmodel);
139140
CONFIG_AUTO_SET(file_body["mode_param"], b_use_topk);
140141
CONFIG_AUTO_SET(file_body["mode_param"], b_vpm_two_stage);
@@ -218,6 +219,7 @@ class llm_task {
218219
mode_config_.filename_post_axmodel = base_model + mode_config_.filename_post_axmodel;
219220
mode_config_.template_filename_axmodel = base_model + mode_config_.template_filename_axmodel;
220221
mode_config_.filename_vpm_resampler_axmodedl = base_model + mode_config_.filename_vpm_resampler_axmodedl;
222+
mode_config_.filename_image_encoder_axmodedl = base_model + mode_config_.filename_image_encoder_axmodedl;
221223
mode_config_.runing_callback = [this](int *p_token, int n_token, const char *p_str, float token_per_sec,
222224
void *reserve) {
223225
if (this->out_callback_) {
@@ -341,36 +343,39 @@ class llm_task {
341343

342344
if (lLaMa_ctx_) {
343345
if (image_data_.empty()) {
344-
lLaMa_ctx_->Encode(prompt_data_, prompt_complete(prompt_), last_reply, tokens_ids, tokens_diff);
346+
lLaMa_ctx_->Encode(prompt_data_, prompt_complete(msg), last_reply, tokens_ids, tokens_diff);
345347
if (auto ret = lLaMa_ctx_->SetKVCache(k_caches, v_caches, precompute_len, tokens_diff.size());
346348
ret != 0) {
347349
ALOGE("SetKVCache failed: %d,the context may be full,input \"reset\" to reset context", ret);
348350
return;
349351
}
350352
last_reply = lLaMa_ctx_->Run(prompt_data_);
351353
lLaMa_ctx_->GetKVCache(k_caches, v_caches, precompute_len);
354+
if (out_callback_) out_callback_(last_reply, true);
355+
} else {
356+
cv::Mat src = cv::imdecode(image_data_, cv::IMREAD_COLOR);
357+
if (src.empty()) return;
358+
image_data_.clear();
359+
std::vector<unsigned short> img_embed;
360+
if (auto ret = lLaMa_ctx_->Encode(src, img_embed); ret != 0) {
361+
ALOGE("lLaMaCtx.Encode failed");
362+
return;
363+
}
364+
if (auto ret =
365+
lLaMa_ctx_->Encode(img_embed, prompt_data_, prompt_complete(msg), tokens_ids, tokens_diff);
366+
ret != 0) {
367+
ALOGE("lLaMaCtx.Encode failed");
368+
return;
369+
}
370+
if (auto ret = lLaMa_ctx_->SetKVCache(k_caches, v_caches, precompute_len, tokens_diff.size());
371+
ret != 0) {
372+
ALOGE("SetKVCache failed: %d,the context may be full,input \"reset\" to reset context", ret);
373+
return;
374+
}
375+
last_reply = lLaMa_ctx_->Run(prompt_data_);
376+
lLaMa_ctx_->GetKVCache(k_caches, v_caches, precompute_len);
377+
if (out_callback_) out_callback_(last_reply, true);
352378
}
353-
cv::Mat src = cv::imdecode(image_data_, cv::IMREAD_COLOR);
354-
if (src.empty()) return;
355-
image_data_.clear();
356-
std::vector<unsigned short> img_embed;
357-
if (auto ret = lLaMa_ctx_->Encode(src, img_embed); ret != 0) {
358-
ALOGE("lLaMaCtx.Encode failed");
359-
return;
360-
}
361-
if (auto ret =
362-
lLaMa_ctx_->Encode(img_embed, prompt_data_, prompt_complete(prompt_), tokens_ids, tokens_diff);
363-
ret != 0) {
364-
ALOGE("lLaMaCtx.Encode failed");
365-
return;
366-
}
367-
if (auto ret = lLaMa_ctx_->SetKVCache(k_caches, v_caches, precompute_len, tokens_diff.size());
368-
ret != 0) {
369-
ALOGE("SetKVCache failed: %d,the context may be full,input \"reset\" to reset context", ret);
370-
return;
371-
}
372-
last_reply = lLaMa_ctx_->Run(prompt_data_);
373-
lLaMa_ctx_->GetKVCache(k_caches, v_caches, precompute_len);
374379
}
375380
} catch (...) {
376381
SLOGW("lLaMa_->Run have error!");

projects/llm_framework/main_vlm/src/runner/LLM.hpp

Lines changed: 1 addition & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -469,20 +469,8 @@ class LLM {
469469
if (_attr.b_dynamic_load_axmodel_layer) {
470470
layer.layer.deinit();
471471
}
472-
// ALOGI("%f %f %f %f %f", bfloat16(embed[0]).fp32(), bfloat16(embed[1]).fp32(), bfloat16(embed[2]).fp32(),
473-
// bfloat16(embed[3]).fp32(), bfloat16(embed[4]).fp32());
474472
}
475473

476-
// ALOGI("prefill time cost: %.2f s", t_cost.cost() / 1000);
477-
478-
// print token_ids
479-
// printf("%s\n", input_str.c_str());
480-
// for (size_t i = 0; i < token_ids.size(); i++)
481-
// {
482-
// printf("%d ", token_ids[i]);
483-
// }
484-
// printf("\n");
485-
486474
int next_token = -1;
487475
t_cqdm cqdm = create_cqdm(_attr.max_token_len, 32);
488476
std::vector<unsigned short> embed(_attr.tokens_embed_size, 0);
@@ -522,10 +510,7 @@ class LLM {
522510
break;
523511
}
524512

525-
// ALOGI("out %d %d", indices, next_token);
526513
embed_selector.getByIndex(next_token, embed);
527-
// ALOGI("%f %f %f %f %f", bfloat16(embed[0]).fp32(), bfloat16(embed[1]).fp32(), bfloat16(embed[2]).fp32(),
528-
// bfloat16(embed[3]).fp32(), bfloat16(embed[4]).fp32());
529514

530515
for (int m = 0; m < _attr.axmodel_num; m++) {
531516
if (b_stop) {
@@ -580,10 +565,8 @@ class LLM {
580565
if (_attr.b_dynamic_load_axmodel_layer) {
581566
layer.layer.deinit();
582567
}
583-
// ALOGI("%f %f %f %f %f", bfloat16(embed[0]).fp32(), bfloat16(embed[1]).fp32(),
584-
// bfloat16(embed[2]).fp32(), bfloat16(embed[3]).fp32(), bfloat16(embed[4]).fp32());
585568
}
586-
// ALOGI("");
569+
587570
mask[indices] = 0;
588571
{
589572
// post process
@@ -644,9 +627,6 @@ class LLM {
644627
float t_cost_ms = t_cost.cost();
645628
ALOGN("hit eos,avg %.2f token/s\n", token_ids.size() / (t_cost_ms / 1000));
646629

647-
// 去掉 len_of_input 那部分
648-
// token_ids.erase(token_ids.begin(), token_ids.begin() + len_of_input);
649-
650630
final_out = tokenizer->Decode(token_ids);
651631

652632
return final_out;
@@ -1358,23 +1338,6 @@ class LLM_CTX {
13581338
return 0;
13591339
}
13601340

1361-
int Encode(std::vector<unsigned short> &out_embed, std::string prompt = "What is in the image?")
1362-
{
1363-
ImageInfo img_info;
1364-
img_info.img_prompt = false;
1365-
std::vector<int> input_ids = tokenizer->Encode(prompt, img_info);
1366-
if (input_ids.size() > _attr.prefill_token_num) {
1367-
ALOGE("input_ids(%ld) > prefill_token_num(%d)", input_ids.size(), _attr.prefill_token_num);
1368-
return -1;
1369-
}
1370-
out_embed.resize(input_ids.size() * _attr.tokens_embed_size);
1371-
1372-
for (size_t i = 0; i < input_ids.size(); i++) {
1373-
embed_selector.getByIndex(input_ids[i], out_embed.data() + i * _attr.tokens_embed_size);
1374-
}
1375-
1376-
return 0;
1377-
}
13781341

13791342
int Encode(std::vector<std::vector<unsigned short>> &imgs_embed, std::vector<unsigned short> &out_embed,
13801343
std::string prompt, std::vector<int> &tokens_ids, std::vector<int> &tokens_diff)

0 commit comments

Comments
 (0)