
Inference on relatively long text crashes with an error #338

Open
leizhu1989 opened this issue Jul 30, 2024 · 4 comments
@leizhu1989

With the chatglm4-9b-1m model, running batched inference over files randomly crashes with the following error:

GGML_ASSERT: /home/lili/chatglm.cpp/third_party/ggml/src/ggml.c:3596: view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)
Aborted (core dumped)

@leizhu1989
Author

It might be caused by running out of GPU memory.

@leizhu1989
Author

No, that's not it. It happens randomly, so it looks like a real bug.

@leizhu1989
Author

Output right before the crash:

6
.
题型
多样化
多样化的
练习
能够
帮助
巩固
GGML_ASSERT: /home/lili/chatglm.cpp/third_party/ggml/src/ggml.c:3596: view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)
Aborted (core dumped)

It looks like the bug shows up right as generation finishes. I ran the original openai_api code:

import asyncio
import base64
import io
import json
import logging
import time
from typing import Dict, List, Literal, Optional, Union

import chatglm_cpp
import uvicorn
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, computed_field
from pydantic_settings import BaseSettings
from sse_starlette.sse import EventSourceResponse

logging.basicConfig(level=logging.INFO, format=r"%(asctime)s - %(module)s - %(levelname)s - %(message)s")

class Settings(BaseSettings):
    model: str = "/home/lili/models/chatglm-ggml-int8-9b-1m.bin"
    max_length: int = 4096

class ToolCallFunction(BaseModel):
    arguments: str
    name: str

class ToolCall(BaseModel):
    function: Optional[ToolCallFunction] = None
    type: Literal["function"]

class ContentText(BaseModel):
    type: Literal["text"] = "text"
    text: str

class ContentImageUrlData(BaseModel):
    url: str
    detail: str = "high"

class ContentImageUrl(BaseModel):
    type: Literal["image_url"] = "image_url"
    image_url: ContentImageUrlData

class ChatMessage(BaseModel):
    role: Literal["system", "user", "assistant"]
    content: Union[str, List[Union[ContentText, ContentImageUrl]]]
    tool_calls: Optional[List[ToolCall]] = None

class DeltaMessage(BaseModel):
    role: Optional[Literal["system", "user", "assistant"]] = None
    content: Optional[str] = None
    tool_calls: Optional[List[ToolCall]] = None

class ChatCompletionToolFunction(BaseModel):
    description: Optional[str] = None
    name: str
    parameters: Dict

class ChatCompletionTool(BaseModel):
    type: Literal["function"] = "function"
    function: ChatCompletionToolFunction

class ChatCompletionRequest(BaseModel):
    model: str = "default-model"
    messages: List[ChatMessage]
    temperature: float = Field(default=0.95, ge=0.0, le=2.0)
    top_p: float = Field(default=0.7, ge=0.0, le=1.0)
    top_k: int = Field(default=2, ge=3)
    stream: bool = False
    max_tokens: int = Field(default=2048, ge=0)
    tools: Optional[List[ChatCompletionTool]] = None
    repeat_penalty: float = Field(default=1.0, ge=0.0, le=2.0)

    model_config = {
        "json_schema_extra": {
            "examples": [{"model": "default-model", "messages": [{"role": "user", "content": "你好"}]}]
        }
    }

class ChatCompletionResponseChoice(BaseModel):
    index: int = 0
    message: ChatMessage
    finish_reason: Literal["stop", "length", "function_call"]

class ChatCompletionResponseStreamChoice(BaseModel):
    index: int = 0
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]] = None

class ChatCompletionUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int

    @computed_field
    @property
    def total_tokens(self) -> int:
        return self.prompt_tokens + self.completion_tokens

class ChatCompletionResponse(BaseModel):
    id: str = "chatcmpl"
    model: str = "default-model"
    object: Literal["chat.completion", "chat.completion.chunk"]
    created: int = Field(default_factory=lambda: int(time.time()))
    choices: Union[List[ChatCompletionResponseChoice], List[ChatCompletionResponseStreamChoice]]
    usage: Optional[ChatCompletionUsage] = None

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "id": "chatcmpl",
                    "model": "default-model",
                    "object": "chat.completion",
                    "created": 1691166146,
                    "choices": [
                        {
                            "index": 0,
                            "message": {
                                "role": "assistant",
                                "content": "你好👋！我是人工智能助手 ChatGLM2-6B，很高兴见到你，欢迎问我任何问题。",
                            },
                            "finish_reason": "stop",
                        }
                    ],
                    "usage": {"prompt_tokens": 17, "completion_tokens": 29, "total_tokens": 46},
                }
            ]
        }
    }

settings = Settings()
app = FastAPI()
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)
pipeline = chatglm_cpp.Pipeline(settings.model, max_length=settings.max_length)
lock = asyncio.Lock()

def stream_chat(messages, body):
    yield ChatCompletionResponse(
        object="chat.completion.chunk",
        choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(role="assistant"))],
    )

    for chunk in pipeline.chat(
        messages=messages,
        max_length=4500,
        max_new_tokens=2000,
        max_context_length=2500,
        do_sample=0.95,
        top_p=body.top_p,
        top_k=body.top_k,
        temperature=body.temperature,
        repetition_penalty=body.repeat_penalty,
        stream=True,
    ):
        yield ChatCompletionResponse(
            object="chat.completion.chunk",
            choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(content=chunk.content))],
        )

    yield ChatCompletionResponse(
        object="chat.completion.chunk",
        choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(), finish_reason="stop")],
    )

async def stream_chat_event_publisher(history, body):
    output = ""
    try:
        async with lock:
            for chunk in stream_chat(history, body):
                await asyncio.sleep(0)  # yield control back to event loop for cancellation check
                output += chunk.choices[0].delta.content or ""
                print(chunk.choices[0].delta.content)
                yield chunk.model_dump_json(exclude_unset=True)
        logging.info(f'prompt: "{history[-1]}", stream response: "{output}"')
    except asyncio.CancelledError as e:
        logging.info(f'prompt: "{history[-1]}", stream response (partial): "{output}"')
        raise e

@app.post("/v1/chat/completions")
async def create_chat_completion(body: ChatCompletionRequest) -> ChatCompletionResponse:
    def to_json_arguments(arguments):
        def tool_call(**kwargs):
            return kwargs

        try:
            return json.dumps(eval(arguments, dict(tool_call=tool_call)))
        except Exception:
            return arguments

    if not body.messages:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "empty messages")

    messages = []
    for msg in body.messages:
        if isinstance(msg.content, str):
            msg.content = msg.content[:3000]
            messages.append(chatglm_cpp.ChatMessage(role=msg.role, content=msg.content))
        # else:
        #     if not (len(msg.content) == 2 and msg.content[0].type == "text" and msg.content[1].type == "image_url"):
        #         raise HTTPException(
        #             status.HTTP_400_BAD_REQUEST,
        #             "multimodal content must have a text item followed by an image_url item",
        #         )

        #     import numpy as np
        #     from PIL import Image

        #     text = msg.content[0].text
        #     image_url = msg.content[1].image_url.url
        #     if image_url.startswith("data:"):
        #         image_bytes = base64.b64decode(image_url.split(",")[1])
        #     else:
        #         import requests

        #         image_bytes = requests.get(image_url).content
        #     image = chatglm_cpp.Image(np.asarray(Image.open(io.BytesIO(image_bytes))))

        #     messages.append(chatglm_cpp.ChatMessage(role=msg.role, content=text, image=image))

    if body.tools:
        system_content = (
            "Answer the following questions as best as you can. You have access to the following tools:\n"
            + json.dumps([tool.model_dump() for tool in body.tools], indent=4)
        )
        messages.insert(0, chatglm_cpp.ChatMessage(role="system", content=system_content))

    if body.stream:
        generator = stream_chat_event_publisher(messages, body)
        return EventSourceResponse(generator)

    max_context_length = 2500
    output = pipeline.chat(
        messages=messages,
        max_length=4500,
        max_new_tokens=2000,
        max_context_length=2500,
        do_sample=0.95,
        top_p=body.top_p,
        top_k=body.top_k,
        temperature=body.temperature,
        repetition_penalty=body.repeat_penalty,
    )
    logging.info(f'prompt: "{messages[-1].content}", sync response: "{output.content}"')
    prompt_tokens = len(pipeline.tokenizer.apply_chat_template(messages, max_context_length))
    completion_tokens = len(pipeline.tokenizer.encode(output.content, body.max_tokens))

    finish_reason = "stop"
    tool_calls = None
    if output.tool_calls:
        tool_calls = [
            ToolCall(
                type=tool_call.type,
                function=ToolCallFunction(
                    name=tool_call.function.name, arguments=to_json_arguments(tool_call.function.arguments)
                ),
            )
            for tool_call in output.tool_calls
        ]
        finish_reason = "function_call"

    return ChatCompletionResponse(
        object="chat.completion",
        choices=[
            ChatCompletionResponseChoice(
                message=ChatMessage(role="assistant", content=output.content, tool_calls=tool_calls),
                finish_reason=finish_reason,
            )
        ],
        usage=ChatCompletionUsage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens),
    )

class ModelCard(BaseModel):
    id: str
    object: Literal["model"] = "model"
    owned_by: str = "owner"
    permission: List = []

class ModelList(BaseModel):
    object: Literal["list"] = "list"
    data: List[ModelCard] = []

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "object": "list",
                    "data": [{"id": "gpt-3.5-turbo", "object": "model", "owned_by": "owner", "permission": []}],
                }
            ]
        }
    }

@app.get("/v1/models")
async def list_models() -> ModelList:
    return ModelList(data=[ModelCard(id="gpt-3.5-turbo")])

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
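
For context, the server was exercised with requests along these lines. This is only a sketch: the endpoint and port come from the code above, while the prompt text and its length are placeholders rather than the actual batched file contents that triggered the crash.

# Hypothetical client sketch (not the actual batch script from this issue):
# send one long document to the server above and print the reply.
import requests

long_text = "……" * 2000  # placeholder for a long document read from a file

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "default-model",
        "messages": [{"role": "user", "content": long_text}],
        "stream": False,
    },
    timeout=600,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])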

@li-plus
Owner

li-plus commented Jul 31, 2024

This is probably a bug. Could you provide a minimal example that reproduces it?
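
A minimal reproduction might take roughly this shape. This is only a sketch under assumptions: it reuses the model path and generation settings from the code above, but the long prompt is filler text, not the real input that caused the assertion.

# Hypothetical minimal repro sketch: drive the pipeline directly with a long
# prompt and the same generation settings as the API server above.
import chatglm_cpp

pipeline = chatglm_cpp.Pipeline(
    "/home/lili/models/chatglm-ggml-int8-9b-1m.bin", max_length=4096
)

# Placeholder long input; the real trigger inputs are not attached to this issue.
long_text = "题型多样化的练习能够帮助巩固。" * 300

messages = [chatglm_cpp.ChatMessage(role="user", content=long_text[:3000])]
for chunk in pipeline.chat(
    messages,
    max_length=4500,
    max_new_tokens=2000,
    max_context_length=2500,
    do_sample=True,
    stream=True,
):
    print(chunk.content, end="", flush=True)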
