
Commit 2d4fff9

Update vllm==0.6.6.post1

1 parent 29090cc · commit 2d4fff9

7 files changed: +461 -39 lines changed

gpt_server/model_backend/utils.py (+21 -2)

@@ -1,15 +1,34 @@
-from typing import List
+from typing import List, Type, Union
+from pydantic import BaseModel
 from transformers.generation.logits_process import LogitsProcessor
+from transformers import PreTrainedTokenizerBase
 from transformers.generation.stopping_criteria import (
     StoppingCriteria,
     StoppingCriteriaList,
     STOPPING_CRITERIA_INPUTS_DOCSTRING,
     add_start_docstrings,
 )
-
+import xgrammar as xgr
 import torch
 
 
+class XgrammarLogitsProcessor(LogitsProcessor):
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
+        self.grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
+        # -----------
+
+    def get_grammar_compiler(self, schema: Union[str, Type[BaseModel]]):
+        compiled_grammar = self.grammar_compiler.compile_json_schema(schema)
+        self.xgr_logits_processor = xgr.contrib.hf.LogitsProcessor(compiled_grammar)
+        return self.xgr_logits_processor
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        return self.xgr_logits_processor(input_ids=input_ids, scores=scores)
+
+
 class InvalidScoreLogitsProcessor(LogitsProcessor):
     def __call__(
         self, input_ids: torch.LongTensor, scores: torch.FloatTensor
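For orientation, here is a minimal usage sketch of the new XgrammarLogitsProcessor in a plain Hugging Face generate() call. It is not part of the commit; the checkpoint name, prompt, and schema are illustrative assumptions.

```python
# Hypothetical usage sketch; checkpoint name, prompt, and schema are assumptions.
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.logits_process import LogitsProcessorList

from gpt_server.model_backend.utils import XgrammarLogitsProcessor


class Distance(BaseModel):
    distance_km: float


tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

processor = XgrammarLogitsProcessor(tokenizer)
processor.get_grammar_compiler(Distance)  # compile the JSON schema once per request

inputs = tokenizer("Answer in JSON: how far is Nanjing from Beijing?", return_tensors="pt")
out = model.generate(
    **inputs,
    max_new_tokens=128,
    logits_processor=LogitsProcessorList([processor]),  # masks tokens that break the schema
)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```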

gpt_server/model_backend/vllm_backend.py (+1 -1)

@@ -126,7 +126,7 @@ async def stream_chat(self, params: Dict[str, Any]) -> AsyncGenerator:
                 choice=None,
                 grammar=None,
                 json_object=guided_json_object,
-                backend="lm-format-enforcer",
+                backend="xgrammar",
                 whitespace_pattern=None,
             )
         # ---- Supports response_format, but the official support for BPE tokenizers is still too poor ----
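For context, a hedged sketch of what this switch means on the caller's side in vLLM 0.6.6.post1: guided decoding is requested through GuidedDecodingParams, and only the backend value below reflects this commit; the schema and token budget are illustrative.

```python
# Illustrative sketch; only backend="xgrammar" is taken from the commit.
from vllm.sampling_params import GuidedDecodingParams, SamplingParams

guided = GuidedDecodingParams(
    json={"type": "object", "properties": {"distance_km": {"type": "number"}}},
    backend="xgrammar",  # previously "lm-format-enforcer"
    whitespace_pattern=None,
)
sampling_params = SamplingParams(max_tokens=256, guided_decoding=guided)
```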

gpt_server/model_worker/embedding_infinity.py (+31 -23)

@@ -4,6 +4,7 @@
 from loguru import logger
 
 from infinity_emb import AsyncEngineArray, EngineArgs, AsyncEmbeddingEngine
+from infinity_emb.inference.select_model import get_engine_type_from_config
 from gpt_server.model_worker.base.model_worker_base import ModelWorkerBase
 
 label_to_category = {
@@ -49,30 +50,26 @@ def __init__(
         bettertransformer = True
         if model_type is not None and "deberta" in model_type:
             bettertransformer = False
-        self.engine: AsyncEmbeddingEngine = AsyncEngineArray.from_args(
-            [
-                EngineArgs(
-                    model_name_or_path=model_path,
-                    engine="torch",
-                    embedding_dtype="float32",
-                    dtype="float32",
-                    device=device,
-                    bettertransformer=bettertransformer,
-                )
-            ]
-        )[0]
+        engine_args = EngineArgs(
+            model_name_or_path=model_path,
+            engine="torch",
+            embedding_dtype="float32",
+            dtype="float32",
+            device=device,
+            bettertransformer=bettertransformer,
+        )
+        engine_type = get_engine_type_from_config(engine_args)
+        engine_type_str = str(engine_type)
+        if "EmbedderEngine" in engine_type_str:
+            self.mode = "embedding"
+        elif "RerankEngine" in engine_type_str:
+            self.mode = "rerank"
+        elif "ImageEmbedEngine" in engine_type_str:
+            self.mode = "image"
+        self.engine: AsyncEmbeddingEngine = AsyncEngineArray.from_args([engine_args])[0]
         loop = asyncio.get_running_loop()
         loop.create_task(self.engine.astart())
-        self.mode = "embedding"
-        # rerank
-        for model_name in model_names:
-            if "rerank" in model_name:
-                self.mode = "rerank"
-                break
-        if self.mode == "rerank":
-            logger.info("Using a rerank model...")
-        elif self.mode == "embedding":
-            logger.info("Using an embedding model...")
+        logger.info(f"Using a {self.mode} model...")
         logger.info(f"Model: {model_names[0]}")
 
     async def astart(self):
@@ -83,7 +80,7 @@ async def get_embeddings(self, params):
         logger.info(f"worker_id: {self.worker_id}")
         self.call_ct += 1
         ret = {"embedding": [], "token_num": 0}
-        texts = params["input"]
+        texts: list = params["input"]
         if self.mode == "embedding":
             texts = list(map(lambda x: x.replace("\n", " "), texts))
             embeddings, usage = await self.engine.embed(sentences=texts)
@@ -105,6 +102,17 @@ async def get_embeddings(self, params):
             embedding = [
                 [round(float(score["relevance_score"]), 6)] for score in ranking
             ]
+        elif self.mode == "image":
+            if (
+                isinstance(texts[0], bytes)
+                or "http" in texts[0]
+                or "data:image" in texts[0]
+            ):
+                embeddings, usage = await self.engine.image_embed(images=texts)
+            else:
+                embeddings, usage = await self.engine.embed(sentences=texts)
+
+            embedding = [embedding.tolist() for embedding in embeddings]
         ret["embedding"] = embedding
         ret["token_num"] = usage
         return ret
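A hedged client-side sketch of the new image branch: inputs containing "http" or "data:image" (or raw bytes) are routed to engine.image_embed() instead of engine.embed(). The model name is a placeholder; the endpoint matches the test file below.

```python
# Hypothetical client call; the model name is a placeholder.
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8082/v1")
resp = client.embeddings.create(
    model="jina-clip-v1",  # placeholder for an image-capable embedding model
    input=["https://example.com/cat.png"],  # "http" in the string selects image_embed()
)
print(len(resp.data[0].embedding))
```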

gpt_server/utils.py (+2 -1)

@@ -42,7 +42,8 @@ def kill_child_processes(parent_pid, including_parent=False):
 
 def signal_handler(signum, frame):
     print("\nCtrl-C detected! Cleaning up...")
-    kill_child_processes(parent_pid, including_parent=False)
+    # kill_child_processes(parent_pid, including_parent=False)
+    stop_server()
     exit(0)  # exit the program normally
 
 
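For completeness, a minimal sketch of how such a handler is typically wired up with the standard signal module; stop_server stands in for the project's own shutdown helper.

```python
# Minimal sketch; stop_server is a stand-in for gpt_server's own helper.
import signal
import sys


def stop_server():
    """Placeholder for the project's real stop_server()."""


def signal_handler(signum, frame):
    print("\nCtrl-C detected! Cleaning up...")
    stop_server()  # graceful shutdown instead of force-killing child processes
    sys.exit(0)


signal.signal(signal.SIGINT, signal_handler)  # run on Ctrl-C
```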
pyproject.toml (+1 -1)

@@ -21,7 +21,7 @@ dependencies = [
     "torch==2.5.1",
     "torchvision==0.20.1",
     "transformers==4.45.2",
-    "vllm==0.6.5",
+    "vllm==0.6.6.post1",
     "qwen_vl_utils",
     "evalscope[perf]==0.7.0",
     "modelscope==1.20.1",

tests/test_openai_completion_response_format.py (+3 -4)

@@ -3,18 +3,17 @@
 
 # new-version openai client
 client = OpenAI(api_key="EMPTY", base_url="http://localhost:8082/v1")
-
 # Method 1
 output = client.chat.completions.create(
-    model="qwen-32b",
+    model="qwen-3b",
     messages=[{"role": "user", "content": "How far is it from Nanjing to Beijing?"}],
     response_format={"type": "text"},
 )
 print(output.choices[0].message.content)
 print("-" * 100)
 # Method 2
 output = client.chat.completions.create(
-    model="qwen-32b",
+    model="qwen-3b",
     messages=[
         {"role": "system", "content": "Answer in JSON"},
         {"role": "user", "content": "How far is it from Nanjing to Beijing?"},
@@ -32,7 +31,7 @@ class Distance(BaseModel):
 
 
 output = client.beta.chat.completions.parse(
-    model="qwen-32b",
+    model="qwen-3b",
     messages=[{"role": "user", "content": "How far is it from Nanjing to Beijing?"}],
     response_format=Distance,
 )
