From 34d32a5761e532cfd018018ade7be03a8f8add2b Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Thu, 21 Nov 2024 16:52:01 +0800
Subject: [PATCH 1/9] autotest: move Qwen2 kvint4 skips into config and add multi-image VL test cases

---
 autotest/config-v100.yaml                     |  26 +-
 autotest/config.yaml                          |  13 +-
 .../test_pipeline_chat_pytorch_llm.py         |   2 -
 .../test_pipeline_chat_pytorch_mllm.py        |   4 -
 .../test_pipeline_chat_turbomind_llm.py       |   2 -
 .../test_pipeline_chat_turbomind_mllm.py      |   4 -
 .../test_restful_chat_hf_pytorch_llm.py       |   3 +-
 .../test_restful_chat_hf_pytorch_mllm.py      |   3 +-
 .../test_restful_chat_hf_turbomind_llm.py     |   3 +-
 .../test_restful_chat_hf_turbomind_mllm.py    |   3 +-
 autotest/utils/pipeline_chat.py               | 323 ++++++++++++++++++
 autotest/utils/run_restful_chat.py            |  15 +-
 docs/en/supported_models/supported_models.md  |   4 +-
 .../supported_models/supported_models.md      |   4 +-
 14 files changed, 375 insertions(+), 34 deletions(-)

diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml
index 41216cb730..adfca51ab8 100644
--- a/autotest/config-v100.yaml
+++ b/autotest/config-v100.yaml
@@ -96,16 +96,21 @@ pytorch_vl_model:
 
 turbomind_quatization:
     no_awq:
-        - meta-llama/Meta-Llama-3-1-8B-Instruct
-        - meta-llama/Meta-Llama-3-8B-Instruct
-        - internlm/internlm-xcomposer2d5-7b
-        - OpenGVLab/Mini-InternVL-Chat-2B-V1-5
+        - Qwen/Qwen2-VL-2B-Instruct
+        - Qwen/Qwen2-VL-7B-Instruct
         - mistralai/Mistral-7B-Instruct-v0.3
-        - THUDM/glm-4-9b-chat
+        - deepseek-ai/deepseek-coder-1.3b-instruct
+        - codellama/CodeLlama-7b-Instruct-hf
     gptq:
         - internlm/internlm2_5-7b-chat
     no_kvint4:
         - openbmb/MiniCPM-V-2_6
+        - Qwen/Qwen2-7B-Instruct
+        - Qwen/Qwen2-7B-Instruct-AWQ
+        - Qwen/Qwen2-1.5B-Instruct
+        - Qwen/Qwen2.5-0.5B-Instruct
+        - Qwen/Qwen2.5-7B-Instruct
+        - Qwen/Qwen2-7B-Instruct-GPTQ-Int4
     no_kvint8:
         - deepseek-ai/DeepSeek-V2-Lite-Chat
 
@@ -115,11 +120,21 @@ pytorch_quatization:
         - internlm/internlm2_5-20b-chat
         - Qwen/Qwen2-1.5B-Instruct
     w8a8:
+        - meta-llama/Meta-Llama-3-8B-Instruct
+        - meta-llama/Llama-2-7b-chat-hf
+        - internlm/internlm2-chat-20b
         - internlm/internlm2_5-7b-chat
+        - internlm/internlm2_5-20b-chat
+        - 01-ai/Yi-6B-Chat
+        - internlm/internlm2_5-20b
         - internlm/internlm2_5-7b
     no_kvint4:
         - OpenGVLab/InternVL2-1B
         - OpenGVLab/InternVL2-4B
+        - Qwen/Qwen2-7B-Instruct
+        - Qwen/Qwen2-1.5B-Instruct
+        - Qwen/Qwen2-VL-2B-Instruct
+        - Qwen/Qwen2-VL-7B-Instruct
         - deepseek-ai/DeepSeek-V2-Lite-Chat
         - microsoft/Phi-3-mini-4k-instruct
         - microsoft/Phi-3-vision-128k-instruct
@@ -128,7 +143,6 @@ pytorch_quatization:
     no_kvint8:
         - deepseek-ai/DeepSeek-V2-Lite-Chat
 
-
 longtext_model:
     - meta-llama/Meta-Llama-3-1-8B-Instruct
     - meta-llama/Meta-Llama-3-8B-Instruct
diff --git a/autotest/config.yaml b/autotest/config.yaml
index 6c92d2cf0b..2ba60e3ed2 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -163,15 +163,23 @@ pytorch_base_model:
 
 turbomind_quatization:
     no_awq:
+        - Qwen/Qwen1.5-MoE-A2.7B-Chat
         - Qwen/Qwen2-VL-2B-Instruct
         - Qwen/Qwen2-VL-7B-Instruct
         - mistralai/Mistral-7B-Instruct-v0.3
+        - mistralai/Mistral-Nemo-Instruct-2407
         - deepseek-ai/deepseek-coder-1.3b-instruct
         - codellama/CodeLlama-7b-Instruct-hf
     gptq:
         - internlm/internlm2_5-7b-chat
     no_kvint4:
         - openbmb/MiniCPM-V-2_6
+        - Qwen/Qwen2-7B-Instruct
+        - Qwen/Qwen2-7B-Instruct-AWQ
+        - Qwen/Qwen2-1.5B-Instruct
+        - Qwen/Qwen2.5-0.5B-Instruct
+        - Qwen/Qwen2.5-7B-Instruct
+        - Qwen/Qwen2-7B-Instruct-GPTQ-Int4
     no_kvint8:
         - deepseek-ai/DeepSeek-V2-Lite-Chat
 
@@ -201,6 +209,10 @@ pytorch_quatization:
     no_kvint4:
         - OpenGVLab/InternVL2-1B
         - OpenGVLab/InternVL2-4B
+        - Qwen/Qwen2-7B-Instruct
+        - Qwen/Qwen2-1.5B-Instruct
+        - Qwen/Qwen2-VL-2B-Instruct
+        - Qwen/Qwen2-VL-7B-Instruct
         - deepseek-ai/DeepSeek-V2-Lite-Chat
         - microsoft/Phi-3-mini-4k-instruct
         - microsoft/Phi-3-vision-128k-instruct
@@ -209,7 +221,6 @@ pytorch_quatization:
     no_kvint8:
         - deepseek-ai/DeepSeek-V2-Lite-Chat
 
-
 longtext_model:
     - meta-llama/Meta-Llama-3-1-8B-Instruct
     - meta-llama/Meta-Llama-3-8B-Instruct
diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
index a828e17a09..58674fa173 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
@@ -67,8 +67,6 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model,
                                               exclude_dup=True))
 def test_pipeline_chat_kvint4_tp1(config, common_case_config, model,
                                   worker_id):
-    if 'Qwen2' in model:
-        return  # kvint4 for qwen2 is not support
     if 'gw' in worker_id:
         os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id)
     spawn_context = get_context('spawn')
diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py
index 276ced5bcb..8403ced94f 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py
@@ -50,8 +50,6 @@ def test_pipeline_chat_tp2(config, model, worker_id):
                                               quant_policy=4,
                                               model_type='vl_model'))
 def test_pipeline_chat_kvint4_tp1(config, model, worker_id):
-    if 'Qwen2' in model:
-        return  # kvint4 for qwen2 is not support
     if 'gw' in worker_id:
         os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id)
     spawn_context = get_context('spawn')
@@ -70,8 +68,6 @@ def test_pipeline_chat_kvint4_tp1(config, model, worker_id):
                                               quant_policy=4,
                                               model_type='vl_model'))
 def test_pipeline_chat_kvint4_tp2(config, model, worker_id):
-    if 'Qwen2' in model:
-        return  # kvint4 for qwen2 is not support
     if 'gw' in worker_id:
         os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id,
                                                                      tp_num=2)
diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
index 17560e754d..d1865175cf 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
@@ -56,8 +56,6 @@ def test_pipeline_chat_tp2(config, common_case_config, model, worker_id):
 @pytest.mark.parametrize('model', get_all_model_list(tp_num=1, quant_policy=4))
 def test_pipeline_chat_kvint4_tp1(config, common_case_config, model,
                                   worker_id):
-    if 'Qwen2' in model:
-        return  # kvint4 for qwen2 is not support
     if 'gw' in worker_id:
         os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id)
     spawn_context = get_context('spawn')
diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py
index 8f1bc7d8b1..8c845fa77a 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py
@@ -50,8 +50,6 @@ def test_pipeline_chat_tp2(config, model, worker_id):
                                             quant_policy=4,
                                             model_type='vl_model'))
 def test_pipeline_chat_kvint4_tp1(config, model, worker_id):
-    if 'Qwen2' in model:
-        return  # kvint4 for qwen2 is not support
     if 'gw' in worker_id:
         os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id)
     spawn_context = get_context('spawn')
@@ -70,8 +68,6 @@ def test_pipeline_chat_kvint4_tp1(config, model, worker_id):
                                             quant_policy=4,
                                             model_type='vl_model'))
 def test_pipeline_chat_kvint4_tp2(config, model, worker_id):
-    if 'Qwen2' in model:
-        return  # kvint4 for qwen2 is not support
     if 'gw' in worker_id:
         os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id,
                                                                      tp_num=2)
diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
index ab1f5595ae..fc95e288ca 100644
--- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
@@ -67,8 +67,7 @@ def getKvintModelList(tp_num, quant_policy):
         'tp_num': tp_num,
         'extra': f'--quant-policy {quant_policy}'
     } for item in get_torch_model_list(
-        tp_num, quant_policy=quant_policy, exclude_dup=True)
-            if 'qwen2' not in item.lower() or quant_policy == 8]
+        tp_num, quant_policy=quant_policy, exclude_dup=True)]
 
 
 @pytest.mark.order(7)
diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py
index b210733db4..bf20c45e6e 100644
--- a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py
@@ -60,8 +60,7 @@ def getKvintModelList(tp_num, quant_policy: int = None):
         'tp_num': tp_num,
         'extra': f'--quant-policy {quant_policy}'
     } for item in get_torch_model_list(
-        tp_num, quant_policy=quant_policy, model_type='vl_model')
-            if 'qwen2' not in item.lower() or quant_policy == 8]
+        tp_num, quant_policy=quant_policy, model_type='vl_model')]
 
 
 @pytest.mark.order(7)
diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
index 91e65ee51a..1c9131b32e 100644
--- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
@@ -66,8 +66,7 @@ def getKvintModelList(tp_num, quant_policy):
         'cuda_prefix': None,
         'tp_num': tp_num,
         'extra': f'--quant-policy {quant_policy}'
-    } for item in get_all_model_list(tp_num, quant_policy=quant_policy)
-            if 'qwen2' not in item.lower() or quant_policy == 8]
+    } for item in get_all_model_list(tp_num, quant_policy=quant_policy)]
 
 
 @pytest.mark.order(7)
diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py
index 091e18e6e3..641f2f760f 100644
--- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py
@@ -60,8 +60,7 @@ def getKvintModelList(tp_num, quant_policy: int = None):
         'tp_num': tp_num,
         'extra': f'--quant-policy {quant_policy}'
     } for item in get_all_model_list(
-        tp_num, quant_policy=quant_policy, model_type='vl_model')
-            if 'qwen2' not in item.lower() or quant_policy == 8]
+        tp_num, quant_policy=quant_policy, model_type='vl_model')]
 
 
 @pytest.mark.order(7)
diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py
index 562a707efe..c09e46a42b 100644
--- a/autotest/utils/pipeline_chat.py
+++ b/autotest/utils/pipeline_chat.py
@@ -3,7 +3,10 @@
 from subprocess import PIPE
 
 import allure
+import numpy as np
 import torch
+from decord import VideoReader, cpu
+from PIL import Image
 from pytest_assume.plugin import assume
 from utils.get_run_config import get_model_name, get_tp_num
 from utils.rule_condition_assert import assert_result
@@ -13,6 +16,7 @@
 from lmdeploy.utils import is_bf16_supported
 from lmdeploy.vl import load_image
 from lmdeploy.vl.constants import IMAGE_TOKEN
+from lmdeploy.vl.utils import encode_image_base64
 
 
 def run_pipeline_chat_test(config,
@@ -275,6 +279,11 @@ def assert_pipeline_single_element(output,
 
 PIC1 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'  # noqa E501
 PIC2 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg'  # noqa E501
+PIC_BEIJING = 'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg'  # noqa E501
+PIC_CHONGQING = 'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'  # noqa E501
+PIC_RACCON = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg'  # noqa E501
+PIC_PANDA = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg'  # noqa E501
+DESC = 'What are the similarities and differences between these two images.'  # noqa E501
 
 
 def run_pipeline_vl_chat_test(config,
@@ -386,12 +395,326 @@ def run_pipeline_vl_chat_test(config,
                     ', reason: Multi-turn example: ski not in ' +
                     sess.response.text + '\n')
 
+    if 'internvl' in model_case.lower():
+        internvl_vl_testcase(config, pipe, file)
+    if 'llava' in model_case.lower():
+        llava_vl_testcase(config, pipe, file)
+    if 'minicpm' in model_case.lower():
+        MiniCPM_vl_testcase(config, pipe, file)
+    if 'qwen' in model_case.lower():
+        Qwen_vl_testcase(config, pipe, file)
+
     file.close()
 
     del pipe
     torch.cuda.empty_cache()
 
 
+def internvl_vl_testcase(config, pipe, file):
+    # multi-image multi-round conversation, combined images
+    messages = [
+        dict(
+            role='user',
+            content=[
+                dict(
+                    type='text',
+                    text=f'{IMAGE_TOKEN}{IMAGE_TOKEN}\n{DESC}'  # noqa E251,E501
+                ),
+                dict(type='image_url',
+                     image_url=dict(max_dynamic_patch=12, url=PIC_RACCON)),
+                dict(type='image_url',
+                     image_url=dict(max_dynamic_patch=12, url=PIC_PANDA))
+            ])
+    ]
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    messages.append(dict(role='assistant', content=response.text))
+    messages.append(dict(role='user', content=DESC))
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    # multi-image multi-round conversation, separate images
+    messages = [
+        dict(
+            role='user',
+            content=[
+                dict(
+                    type='text',
+                    text=f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\n'
+                    +  # noqa E251,E501
+                    DESC),
+                dict(type='image_url',
+                     image_url=dict(max_dynamic_patch=12, url=PIC_RACCON)),
+                dict(type='image_url',
+                     image_url=dict(max_dynamic_patch=12, url=PIC_PANDA))
+            ])
+    ]
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    messages.append(dict(role='assistant', content=response.text))
+    messages.append(dict(role='user', content=DESC))
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    # video multi-round conversation
+    def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+        if bound:
+            start, end = bound[0], bound[1]
+        else:
+            start, end = -100000, 100000
+        start_idx = max(first_idx, round(start * fps))
+        end_idx = min(round(end * fps), max_frame)
+        seg_size = float(end_idx - start_idx) / num_segments
+        frame_indices = np.array([
+            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
+            for idx in range(num_segments)
+        ])
+        return frame_indices
+
+    def load_video(video_path, bound=None, num_segments=32):
+        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+        max_frame = len(vr) - 1
+        fps = float(vr.get_avg_fps())
+        frame_indices = get_index(bound,
+                                  fps,
+                                  max_frame,
+                                  first_idx=0,
+                                  num_segments=num_segments)
+        imgs = []
+        for frame_index in frame_indices:
+            img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
+            imgs.append(img)
+        return imgs
+
+    resource_path = config.get('resource_path')
+    video_path = resource_path + '/red-panda.mp4'
+    imgs = load_video(video_path, num_segments=8)
+
+    question = ''
+    for i in range(len(imgs)):
+        question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n'
+
+    question += 'What is the red panda doing?'
+
+    content = [{'type': 'text', 'text': question}]
+    for img in imgs:
+        content.append({
+            'type': 'image_url',
+            'image_url': {
+                'max_dynamic_patch': 1,
+                'url': f'data:image/jpeg;base64,{encode_image_base64(img)}'
+            }
+        })
+
+    messages = [dict(role='user', content=content)]
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    messages.append(dict(role='assistant', content=response.text))
+    messages.append(
+        dict(role='user',
+             content='Describe this video in detail. Don\'t repeat.'))
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+
+def llava_vl_testcase(config, pipe, file):
+    # multi-image multi-round conversation, combined images
+    messages = [
+        dict(role='user',
+             content=[
+                 dict(type='text', text='Describe the two images in detail.'),
+                 dict(type='image_url', image_url=dict(url=PIC_BEIJING)),
+                 dict(type='image_url', image_url=dict(url=PIC_CHONGQING))
+             ])
+    ]
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    messages.append(dict(role='assistant', content=response.text))
+    messages.append(dict(role='user', content=DESC))
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+
+def MiniCPM_vl_testcase(config, pipe, file):
+    # Chat with multiple images
+    messages = [
+        dict(role='user',
+             content=[
+                 dict(type='text', text='Describe the two images in detail.'),
+                 dict(type='image_url',
+                      image_url=dict(max_slice_nums=9, url=PIC_RACCON)),
+                 dict(type='image_url',
+                      image_url=dict(max_slice_nums=9, url=PIC_PANDA))
+             ])
+    ]
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    messages.append(dict(role='assistant', content=response.text))
+    messages.append(dict(role='user', content=DESC))
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    # In-context few-shot learning
+    question = 'production date'
+    messages = [
+        dict(role='user',
+             content=[
+                 dict(type='text', text=question),
+                 dict(type='image_url', image_url=dict(url='example1.jpg')),
+             ]),
+        dict(role='assistant', content='2023.08.04'),
+        dict(role='user',
+             content=[
+                 dict(type='text', text=question),
+                 dict(type='image_url', image_url=dict(url='example2.jpg')),
+             ]),
+        dict(role='assistant', content='2007.04.24'),
+        dict(role='user',
+             content=[
+                 dict(type='text', text=question),
+                 dict(type='image_url', image_url=dict(url='test.jpg')),
+             ])
+    ]
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    # Chat with video
+    MAX_NUM_FRAMES = 64  # if cuda OOM set a smaller number
+
+    def encode_video(video_path):
+
+        def uniform_sample(length, n):
+            gap = len(length) / n
+            idxs = [int(i * gap + gap / 2) for i in range(n)]
+            return [length[i] for i in idxs]
+
+        vr = VideoReader(video_path, ctx=cpu(0))
+        sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+        frame_idx = [i for i in range(0, len(vr), sample_fps)]
+        if len(frame_idx) > MAX_NUM_FRAMES:
+            frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+        frames = vr.get_batch(frame_idx).asnumpy()
+        frames = [Image.fromarray(v.astype('uint8')) for v in frames]
+        print('num frames:', len(frames))
+        return frames
+
+    resource_path = config.get('resource_path')
+    video_path = resource_path + '/video_test.mp4'
+    frames = encode_video(video_path)
+    question = 'Describe the video'
+
+    content = [dict(type='text', text=question)]
+    for frame in frames:
+        content.append(
+            dict(type='image_url',
+                 image_url=dict(
+                     use_image_id=False,
+                     max_slice_nums=2,
+                     url=f'data:image/jpeg;base64,{encode_image_base64(frame)}'
+                 )))
+
+    messages = [dict(role='user', content=content)]
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+
+def Qwen_vl_testcase(config, pipe, file):
+    # multi-image multi-round conversation, combined images
+    messages = [
+        dict(role='user',
+             content=[
+                 dict(type='text', text='Describe the two images in detail.'),
+                 dict(type='image_url', image_url=dict(url=PIC_BEIJING)),
+                 dict(type='image_url', image_url=dict(url=PIC_CHONGQING))
+             ])
+    ]
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    messages.append(dict(role='assistant', content=response.text))
+    messages.append(dict(role='user', content=DESC))
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    # image resolution for performance boost
+    min_pixels = 64 * 28 * 28
+    max_pixels = 64 * 28 * 28
+    messages = [
+        dict(role='user',
+             content=[
+                 dict(type='text', text='Describe the two images in detail.'),
+                 dict(type='image_url',
+                      image_url=dict(min_pixels=min_pixels,
+                                     max_pixels=max_pixels,
+                                     url=PIC_BEIJING)),
+                 dict(type='image_url',
+                      image_url=dict(min_pixels=min_pixels,
+                                     max_pixels=max_pixels,
+                                     url=PIC_CHONGQING))
+             ])
+    ]
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+    messages.append(dict(role='assistant', content=response.text))
+    messages.append(dict(role='user', content=DESC))
+    response = pipe(messages)
+    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    file.writelines('result:' + str(result) +
+                    ', reason: Multi-turn example: ski not in ' +
+                    response.text + '\n')
+
+
 def assert_pipeline_vl_chat_log(config, model_case, worker_id):
     log_path = config.get('log_path')
 
diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py
index 77af1975be..082a61bcda 100644
--- a/autotest/utils/run_restful_chat.py
+++ b/autotest/utils/run_restful_chat.py
@@ -282,6 +282,7 @@ def get_model(url):
 
 
 PIC = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'  # noqa E501
+PIC2 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg'  # noqa E501
 
 
 def run_vl_testcase(config, port: int = DEFAULT_PORT):
@@ -307,6 +308,11 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT):
             'image_url': {
                 'url': PIC,
             },
+        }, {
+            'type': 'image_url',
+            'image_url': {
+                'url': PIC2,
+            },
         }],
     }]
 
@@ -315,8 +321,6 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT):
                                               temperature=0.8,
                                               top_p=0.8)
     file.writelines(str(response).lower() + '\n')
-    assert 'tiger' in str(response).lower() or '虎' in str(
-        response).lower(), response
 
     api_client = APIClient(http_url)
     model_name = api_client.available_models[0]
@@ -324,7 +328,12 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT):
                                                messages=prompt_messages):
         continue
     file.writelines(str(item) + '\n')
-    assert 'tiger' in str(item).lower() or '虎' in str(item).lower(), item
 
     allure.attach.file(restful_log,
                        attachment_type=allure.attachment_type.TEXT)
+
+    assert 'tiger' in str(response).lower() or '虎' in str(
+        response).lower() or 'ski' in str(response).lower() or '滑雪' in str(
+            response).lower(), response
+    assert 'tiger' in str(item).lower() or '虎' in str(item).lower(
+    ) or 'ski' in str(item).lower() or '滑雪' in str(item).lower(), item
diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md
index 283ce596f6..d685e869cd 100644
--- a/docs/en/supported_models/supported_models.md
+++ b/docs/en/supported_models/supported_models.md
@@ -19,7 +19,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |         Qwen          |   1.8B - 72B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |        Qwen1.5        |  1.8B - 110B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |         Qwen2         |   0.5B - 72B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
-|        Mistral        |       7B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
+|        Mistral        |       7B       | LLM  |    Yes    |   Yes   |   Yes   |  No   |
 |        Mixtral        |  8x7B, 8x22B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |        Qwen-VL        |       7B       | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
 |      DeepSeek-VL      |       7B       | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
@@ -36,7 +36,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |    MiniGeminiLlama    |       7B       | MLLM |    Yes    |    -    |    -    |  Yes  |
 |         GLM4          |       9B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |       CodeGeeX4       |       9B       | LLM  |    Yes    |   Yes   |   Yes   |   -   |
-|         Molmo         |    7B-D,72B    | MLLM |    Yes    |   Yes   |   Yes   |  NO   |
+|         Molmo         |    7B-D,72B    | MLLM |    Yes    |   Yes   |   Yes   |  No   |
 
 "-" means not verified yet.
 
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
index 908f9a17f5..8918423115 100644
--- a/docs/zh_cn/supported_models/supported_models.md
+++ b/docs/zh_cn/supported_models/supported_models.md
@@ -19,7 +19,7 @@
 |         Qwen          |   1.8B - 72B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |        Qwen1.5        |  1.8B - 110B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |         Qwen2         |   0.5B - 72B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
-|        Mistral        |       7B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
+|        Mistral        |       7B       | LLM  |    Yes    |   Yes   |   Yes   |  No   |
 |        Mixtral        |  8x7B, 8x22B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |        Qwen-VL        |       7B       | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
 |      DeepSeek-VL      |       7B       | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
@@ -36,7 +36,7 @@
 |    MiniGeminiLlama    |       7B       | MLLM |    Yes    |    -    |    -    |  Yes  |
 |         GLM4          |       9B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |       CodeGeeX4       |       9B       | LLM  |    Yes    |   Yes   |   Yes   |   -   |
-|         Molmo         |    7B-D,72B    | MLLM |    Yes    |   Yes   |   Yes   |  NO   |
+|         Molmo         |    7B-D,72B    | MLLM |    Yes    |   Yes   |   Yes   |  No   |
 
 “-” 表示还没有验证。
 

From e7150ce304360a00aa75183ac67969f13887ab1c Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Thu, 21 Nov 2024 18:40:08 +0800
Subject: [PATCH 2/9] autotest: add resource_path to configs and trim the v100 w8a8 list

---
 autotest/config-v100.yaml | 7 +------
 autotest/config.yaml      | 1 +
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml
index adfca51ab8..b2714087f4 100644
--- a/autotest/config-v100.yaml
+++ b/autotest/config-v100.yaml
@@ -1,4 +1,5 @@
 model_path: /nvme/qa_test_models
+resource_path: /nvme/qa_test_models/resource
 dst_path: /nvme/qa_test_models/autotest_model
 log_path: /nvme/qa_test_models/autotest_model/log
 benchmark_path: /nvme/qa_test_models/benchmark-reports
@@ -120,13 +121,7 @@ pytorch_quatization:
         - internlm/internlm2_5-20b-chat
         - Qwen/Qwen2-1.5B-Instruct
     w8a8:
-        - meta-llama/Meta-Llama-3-8B-Instruct
-        - meta-llama/Llama-2-7b-chat-hf
-        - internlm/internlm2-chat-20b
         - internlm/internlm2_5-7b-chat
-        - internlm/internlm2_5-20b-chat
-        - 01-ai/Yi-6B-Chat
-        - internlm/internlm2_5-20b
         - internlm/internlm2_5-7b
     no_kvint4:
         - OpenGVLab/InternVL2-1B
diff --git a/autotest/config.yaml b/autotest/config.yaml
index 2ba60e3ed2..b11a21523c 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -1,4 +1,5 @@
 model_path: /nvme/qa_test_models
+resource_path: /nvme/qa_test_models/resource
 dst_path: /nvme/qa_test_models/autotest_model
 log_path: /nvme/qa_test_models/autotest_model/log
 benchmark_path: /nvme/qa_test_models/benchmark-reports

From 4dd83fda6aee0b9480ae64f345d68372ef519176 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Thu, 21 Nov 2024 20:47:48 +0800
Subject: [PATCH 3/9] autotest: replace placeholder 'ski' checks in VL pipeline tests

---
 autotest/utils/pipeline_chat.py | 94 +++++++++++++++++++--------------
 1 file changed, 53 insertions(+), 41 deletions(-)

diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py
index c09e46a42b..a198d8e29d 100644
--- a/autotest/utils/pipeline_chat.py
+++ b/autotest/utils/pipeline_chat.py
@@ -281,7 +281,7 @@ def assert_pipeline_single_element(output,
 PIC2 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg'  # noqa E501
 PIC_BEIJING = 'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg'  # noqa E501
 PIC_CHONGQING = 'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'  # noqa E501
-PIC_RACCON = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg'  # noqa E501
+PIC_REDPANDA = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg'  # noqa E501
 PIC_PANDA = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg'  # noqa E501
 DESC = 'What are the similarities and differences between these two images.'  # noqa E501
 
@@ -421,23 +421,23 @@ def internvl_vl_testcase(config, pipe, file):
                     text=f'{IMAGE_TOKEN}{IMAGE_TOKEN}\n{DESC}'  # noqa E251,E501
                 ),
                 dict(type='image_url',
-                     image_url=dict(max_dynamic_patch=12, url=PIC_RACCON)),
+                     image_url=dict(max_dynamic_patch=12, url=PIC_REDPANDA)),
                 dict(type='image_url',
                      image_url=dict(max_dynamic_patch=12, url=PIC_PANDA))
             ])
     ]
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: combined images: panda not in ' +
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: combined images second: panda not in ' +
                     response.text + '\n')
 
     # multi-image multi-round conversation, separate images
@@ -451,23 +451,23 @@ def internvl_vl_testcase(config, pipe, file):
                     +  # noqa E251,E501
                     DESC),
                 dict(type='image_url',
-                     image_url=dict(max_dynamic_patch=12, url=PIC_RACCON)),
+                     image_url=dict(max_dynamic_patch=12, url=PIC_REDPANDA)),
                 dict(type='image_url',
                      image_url=dict(max_dynamic_patch=12, url=PIC_PANDA))
             ])
     ]
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: separate images: panda not in ' +
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: separate images second: panda not in ' +
                     response.text + '\n')
 
     # video multi-round conversation
@@ -522,9 +522,9 @@ def load_video(video_path, bound=None, num_segments=32):
 
     messages = [dict(role='user', content=content)]
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: video images: red panda not in ' +
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
@@ -532,9 +532,10 @@ def load_video(video_path, bound=None, num_segments=32):
         dict(role='user',
              content='Describe this video in detail. Don\'t repeat.'))
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'red pandas' in response.text.lower(
+    ) or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: video images: red panda not in ' +
                     response.text + '\n')
 
 
@@ -549,17 +550,19 @@ def llava_vl_testcase(config, pipe, file):
              ])
     ]
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'buildings' in response.text.lower(
+    ) or '楼' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: combined images: buildings not in ' +
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'buildings' in response.text.lower(
+    ) or '楼' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: combined images second: buildings not in ' +
                     response.text + '\n')
 
 
@@ -570,50 +573,53 @@ def MiniCPM_vl_testcase(config, pipe, file):
              content=[
                  dict(type='text', text='Describe the two images in detail.'),
                  dict(type='image_url',
-                      image_url=dict(max_slice_nums=9, url=PIC_RACCON)),
+                      image_url=dict(max_slice_nums=9, url=PIC_REDPANDA)),
                  dict(type='image_url',
                       image_url=dict(max_slice_nums=9, url=PIC_PANDA))
              ])
     ]
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: multiple images: panda not in ' +
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: multiple images second: panda not in ' +
                     response.text + '\n')
 
     # In-context few-shot learning
+    EXAMPLE1 = 'https://github.com/user-attachments/assets/405d9147-95f6-4f78-8879-606a0aed6707'  # noqa E251,E501
+    EXAMPLE2 = 'https://github.com/user-attachments/assets/9f2c6ed9-2aa5-4189-9c4f-0b9753024ba1'  # noqa E251,E501
+    EXAMPLE3 = 'https://github.com/user-attachments/assets/f335b507-1957-4c22-84ae-ed69ff79df38'  # noqa E251,E501
     question = 'production date'
     messages = [
         dict(role='user',
              content=[
                  dict(type='text', text=question),
-                 dict(type='image_url', image_url=dict(url='example1.jpg')),
+                 dict(type='image_url', image_url=dict(url=EXAMPLE1)),
              ]),
-        dict(role='assistant', content='2023.08.04'),
+        dict(role='assistant', content='2021.08.29'),
         dict(role='user',
              content=[
                  dict(type='text', text=question),
-                 dict(type='image_url', image_url=dict(url='example2.jpg')),
+                 dict(type='image_url', image_url=dict(url=EXAMPLE2)),
              ]),
-        dict(role='assistant', content='2007.04.24'),
+        dict(role='assistant', content='1999.05.15'),
         dict(role='user',
              content=[
                  dict(type='text', text=question),
-                 dict(type='image_url', image_url=dict(url='test.jpg')),
+                 dict(type='image_url', image_url=dict(url=EXAMPLE3)),
              ])
     ]
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = '2021' in response.text.lower() or '14' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: in context learning: 2021 not in ' +
                     response.text + '\n')
 
     # Chat with video
@@ -653,10 +659,11 @@ def uniform_sample(length, n):
 
     messages = [dict(role='user', content=content)]
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'red panda' in response.text.lower(
+    ) or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
-                    response.text + '\n')
+                    ', reason: video example: panda not in ' + response.text +
+                    '\n')
 
 
 def Qwen_vl_testcase(config, pipe, file):
@@ -670,17 +677,19 @@ def Qwen_vl_testcase(config, pipe, file):
              ])
     ]
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'buildings' in response.text.lower(
+    ) or '楼' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: combined images: buildings not in ' +
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'buildings' in response.text.lower(
+    ) or '楼' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: combined images second: buildings not in ' +
                     response.text + '\n')
 
     # image resolution for performance boost
@@ -702,16 +711,19 @@ def Qwen_vl_testcase(config, pipe, file):
     ]
     response = pipe(messages)
     result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'buildings' in response.text.lower(
+    ) or '楼' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: performance boost: buildings not in ' +
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
-    result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
+    result = 'buildings' in response.text.lower(
+    ) or '楼' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: Multi-turn example: ski not in ' +
+                    ', reason: performance boost second: buildings not in ' +
                     response.text + '\n')
 
 

From d970ad3f380689dcdb8ab7ee23c08d247ce77249 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Fri, 22 Nov 2024 09:41:08 +0800
Subject: [PATCH 4/9] autotest: relax VL keyword checks ('panda', 'skyline')

---
 autotest/utils/pipeline_chat.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py
index a198d8e29d..004be0d695 100644
--- a/autotest/utils/pipeline_chat.py
+++ b/autotest/utils/pipeline_chat.py
@@ -427,7 +427,7 @@ def internvl_vl_testcase(config, pipe, file):
             ])
     ]
     response = pipe(messages)
-    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
+    result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images: panda not in ' +
                     response.text + '\n')
@@ -435,7 +435,7 @@ def internvl_vl_testcase(config, pipe, file):
     messages.append(dict(role='assistant', content=response.text))
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
-    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
+    result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images second: panda not in ' +
                     response.text + '\n')
@@ -457,7 +457,7 @@ def internvl_vl_testcase(config, pipe, file):
             ])
     ]
     response = pipe(messages)
-    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
+    result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: separate images: panda not in ' +
                     response.text + '\n')
@@ -465,7 +465,7 @@ def internvl_vl_testcase(config, pipe, file):
     messages.append(dict(role='assistant', content=response.text))
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
-    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
+    result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: separate images second: panda not in ' +
                     response.text + '\n')
@@ -522,7 +522,7 @@ def load_video(video_path, bound=None, num_segments=32):
 
     messages = [dict(role='user', content=content)]
     response = pipe(messages)
-    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
+    result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: video images: red panda not in ' +
                     response.text + '\n')
@@ -551,7 +551,7 @@ def llava_vl_testcase(config, pipe, file):
     ]
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images: buildings not in ' +
                     response.text + '\n')
@@ -560,7 +560,7 @@ def llava_vl_testcase(config, pipe, file):
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images second: buildings not in ' +
                     response.text + '\n')
@@ -579,7 +579,7 @@ def MiniCPM_vl_testcase(config, pipe, file):
              ])
     ]
     response = pipe(messages)
-    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
+    result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: multiple images: panda not in ' +
                     response.text + '\n')
@@ -587,7 +587,7 @@ def MiniCPM_vl_testcase(config, pipe, file):
     messages.append(dict(role='assistant', content=response.text))
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
-    result = 'pandas' in response.text.lower() or '熊猫' in response.text.lower()
+    result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: multiple images second: panda not in ' +
                     response.text + '\n')
@@ -678,7 +678,7 @@ def Qwen_vl_testcase(config, pipe, file):
     ]
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images: buildings not in ' +
                     response.text + '\n')
@@ -687,7 +687,7 @@ def Qwen_vl_testcase(config, pipe, file):
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images second: buildings not in ' +
                     response.text + '\n')
@@ -712,7 +712,7 @@ def Qwen_vl_testcase(config, pipe, file):
     response = pipe(messages)
     result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: performance boost: buildings not in ' +
                     response.text + '\n')
@@ -721,7 +721,7 @@ def Qwen_vl_testcase(config, pipe, file):
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: performance boost second: buildings not in ' +
                     response.text + '\n')

From accc42b9e0b635d44fb941732005bfbd9fd2949b Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Fri, 22 Nov 2024 10:00:50 +0800
Subject: [PATCH 5/9] autotest: update in-context log text and test video file name

---
 autotest/utils/pipeline_chat.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py
index 004be0d695..a1a873d431 100644
--- a/autotest/utils/pipeline_chat.py
+++ b/autotest/utils/pipeline_chat.py
@@ -619,7 +619,7 @@ def MiniCPM_vl_testcase(config, pipe, file):
     response = pipe(messages)
     result = '2021' in response.text.lower() or '14' in response.text.lower()
     file.writelines('result:' + str(result) +
-                    ', reason: in context learning: 2021 not in ' +
+                    ', reason: in context learning: 2021 or 14 not in ' +
                     response.text + '\n')
 
     # Chat with video
@@ -643,7 +643,7 @@ def uniform_sample(length, n):
         return frames
 
     resource_path = config.get('resource_path')
-    video_path = resource_path + '/video_test.mp4'
+    video_path = resource_path + '/red-panda.mp4'
     frames = encode_video(video_path)
     question = 'Describe the video'
 

From d68d45f16b83273e68c46f5736cb5b7aa8be643d Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Fri, 22 Nov 2024 10:21:42 +0800
Subject: [PATCH 6/9] autotest: run internvl VL testcase in Chinese as well

---
 autotest/utils/pipeline_chat.py | 49 +++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py
index a1a873d431..4318eaca16 100644
--- a/autotest/utils/pipeline_chat.py
+++ b/autotest/utils/pipeline_chat.py
@@ -284,6 +284,7 @@ def assert_pipeline_single_element(output,
 PIC_REDPANDA = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg'  # noqa E501
 PIC_PANDA = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg'  # noqa E501
 DESC = 'What are the similarities and differences between these two images.'  # noqa E501
+DESC_ZH = '两张图有什么相同和不同的地方.'  # noqa E501
 
 
 def run_pipeline_vl_chat_test(config,
@@ -397,6 +398,7 @@ def run_pipeline_vl_chat_test(config,
 
     if 'internvl' in model_case.lower():
         internvl_vl_testcase(config, pipe, file)
+        internvl_vl_testcase(config, pipe, file, 'cn')
     if 'llava' in model_case.lower():
         llava_vl_testcase(config, pipe, file)
     if 'minicpm' in model_case.lower():
@@ -410,21 +412,22 @@ def run_pipeline_vl_chat_test(config,
     torch.cuda.empty_cache()
 
 
-def internvl_vl_testcase(config, pipe, file):
+def internvl_vl_testcase(config, pipe, file, lang='en'):
+    if lang == 'cn':
+        description = DESC_ZH
+    else:
+        description = DESC
     # multi-image multi-round conversation, combined images
     messages = [
-        dict(
-            role='user',
-            content=[
-                dict(
-                    type='text',
-                    text=f'{IMAGE_TOKEN}{IMAGE_TOKEN}\n{DESC}'  # noqa E251,E501
-                ),
-                dict(type='image_url',
-                     image_url=dict(max_dynamic_patch=12, url=PIC_REDPANDA)),
-                dict(type='image_url',
-                     image_url=dict(max_dynamic_patch=12, url=PIC_PANDA))
-            ])
+        dict(role='user',
+             content=[
+                 dict(type='text',
+                      text=f'{IMAGE_TOKEN}{IMAGE_TOKEN}\n{description}'),
+                 dict(type='image_url',
+                      image_url=dict(max_dynamic_patch=12, url=PIC_REDPANDA)),
+                 dict(type='image_url',
+                      image_url=dict(max_dynamic_patch=12, url=PIC_PANDA))
+             ])
     ]
     response = pipe(messages)
     result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
@@ -433,7 +436,7 @@ def internvl_vl_testcase(config, pipe, file):
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
-    messages.append(dict(role='user', content=DESC))
+    messages.append(dict(role='user', content=description))
     response = pipe(messages)
     result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
@@ -449,7 +452,7 @@ def internvl_vl_testcase(config, pipe, file):
                     type='text',
                     text=f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\n'
                     +  # noqa E251,E501
-                    DESC),
+                    description),
                 dict(type='image_url',
                      image_url=dict(max_dynamic_patch=12, url=PIC_REDPANDA)),
                 dict(type='image_url',
@@ -463,7 +466,7 @@ def internvl_vl_testcase(config, pipe, file):
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
-    messages.append(dict(role='user', content=DESC))
+    messages.append(dict(role='user', content=description))
     response = pipe(messages)
     result = 'panda' in response.text.lower() or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
@@ -508,7 +511,10 @@ def load_video(video_path, bound=None, num_segments=32):
     for i in range(len(imgs)):
         question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n'
 
-    question += 'What is the red panda doing?'
+    if lang == 'cn':
+        question += '小熊猫在做什么？'
+    else:
+        question += 'What is the red panda doing?'
 
     content = [{'type': 'text', 'text': question}]
     for img in imgs:
@@ -528,9 +534,12 @@ def load_video(video_path, bound=None, num_segments=32):
                     response.text + '\n')
 
     messages.append(dict(role='assistant', content=response.text))
-    messages.append(
-        dict(role='user',
-             content='Describe this video in detail. Don\'t repeat.'))
+    if lang == 'cn':
+        messages.append(dict(role='user', content='描述视频详情，不要重复'))
+    else:
+        messages.append(
+            dict(role='user',
+                 content='Describe this video in detail. Don\'t repeat.'))
     response = pipe(messages)
     result = 'red pandas' in response.text.lower(
     ) or '熊猫' in response.text.lower()

From 4ee75d94d7350103e83fb3f8e3a52eabef5af903 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Fri, 22 Nov 2024 12:55:22 +0800
Subject: [PATCH 7/9] autotest: drop llava VL testcase call, accept 'cityscape'

---
 autotest/utils/pipeline_chat.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py
index 4318eaca16..023e4ac142 100644
--- a/autotest/utils/pipeline_chat.py
+++ b/autotest/utils/pipeline_chat.py
@@ -399,8 +399,6 @@ def run_pipeline_vl_chat_test(config,
     if 'internvl' in model_case.lower():
         internvl_vl_testcase(config, pipe, file)
         internvl_vl_testcase(config, pipe, file, 'cn')
-    if 'llava' in model_case.lower():
-        llava_vl_testcase(config, pipe, file)
     if 'minicpm' in model_case.lower():
         MiniCPM_vl_testcase(config, pipe, file)
     if 'qwen' in model_case.lower():
@@ -541,7 +539,7 @@ def load_video(video_path, bound=None, num_segments=32):
             dict(role='user',
                  content='Describe this video in detail. Don\'t repeat.'))
     response = pipe(messages)
-    result = 'red pandas' in response.text.lower(
+    result = 'red panda' in response.text.lower(
     ) or '熊猫' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: video images: red panda not in ' +
@@ -560,7 +558,8 @@ def llava_vl_testcase(config, pipe, file):
     ]
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower(
+    ) or 'cityscape' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images: buildings not in ' +
                     response.text + '\n')
@@ -569,7 +568,8 @@ def llava_vl_testcase(config, pipe, file):
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower(
+    ) or 'cityscape' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images second: buildings not in ' +
                     response.text + '\n')
@@ -687,7 +687,8 @@ def Qwen_vl_testcase(config, pipe, file):
     ]
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower(
+    ) or 'cityscape' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images: buildings not in ' +
                     response.text + '\n')
@@ -696,7 +697,8 @@ def Qwen_vl_testcase(config, pipe, file):
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower(
+    ) or 'cityscape' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: combined images second: buildings not in ' +
                     response.text + '\n')
@@ -721,7 +723,8 @@ def Qwen_vl_testcase(config, pipe, file):
     response = pipe(messages)
     result = 'ski' in response.text.lower() or '滑雪' in response.text.lower()
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower(
+    ) or 'cityscape' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: performance boost: buildings not in ' +
                     response.text + '\n')
@@ -730,7 +733,8 @@ def Qwen_vl_testcase(config, pipe, file):
     messages.append(dict(role='user', content=DESC))
     response = pipe(messages)
     result = 'buildings' in response.text.lower(
-    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower()
+    ) or '楼' in response.text.lower() or 'skyline' in response.text.lower(
+    ) or 'cityscape' in response.text.lower()
     file.writelines('result:' + str(result) +
                     ', reason: performance boost second: buildings not in ' +
                     response.text + '\n')

From 10ff0339cb8bd1b2f3c6ddd0c47a903d08d27aaf Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Tue, 26 Nov 2024 13:37:10 +0800
Subject: [PATCH 8/9] autotest: restore models to v100 turbomind no_awq list

---
 autotest/config-v100.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml
index b2714087f4..507f81ceb6 100644
--- a/autotest/config-v100.yaml
+++ b/autotest/config-v100.yaml
@@ -97,9 +97,14 @@ pytorch_vl_model:
 
 turbomind_quatization:
     no_awq:
+        - meta-llama/Meta-Llama-3-1-8B-Instruct
+        - meta-llama/Meta-Llama-3-8B-Instruct
+        - internlm/internlm-xcomposer2d5-7b
+        - OpenGVLab/Mini-InternVL-Chat-2B-V1-5
         - Qwen/Qwen2-VL-2B-Instruct
         - Qwen/Qwen2-VL-7B-Instruct
         - mistralai/Mistral-7B-Instruct-v0.3
+        - THUDM/glm-4-9b-chat
         - deepseek-ai/deepseek-coder-1.3b-instruct
         - codellama/CodeLlama-7b-Instruct-hf
     gptq:

From a221cd60969fb518d033cdc485eca79941ff53bd Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Tue, 26 Nov 2024 18:31:13 +0800
Subject: [PATCH 9/9] autotest: add Qwen2.5-72B tp config and benchmark entries

---
 autotest/config.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/autotest/config.yaml b/autotest/config.yaml
index b11a21523c..1f78411ee9 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -19,6 +19,7 @@ tp_config:
     Qwen2-7B-Instruct-GPTQ-Int4: 2
     InternVL2-40B: 2
     MiniCPM-V-2_6: 2
+    Qwen2.5-72B-Instruct: 4
 
 turbomind_chat_model:
     - meta-llama/Llama-3.2-1B-Instruct
@@ -237,7 +238,8 @@ benchmark_model:
     - internlm/internlm2_5-7b-chat
     - internlm/internlm2_5-20b-chat
     - THUDM/glm-4-9b-chat
-    - Qwen/Qwen2-7B-Instruct
+    - Qwen/Qwen2.5-7B-Instruct
+    - Qwen/Qwen2.5-72B-Instruct
     - mistralai/Mistral-7B-Instruct-v0.3
     - mistralai/Mixtral-8x7B-Instruct-v0.1
     - deepseek-ai/DeepSeek-V2-Lite-Chat