
Inference on relatively long text crashes with an error #338

Open
leizhu1989 opened this issue Jul 30, 2024 · 4 comments
@leizhu1989

With the chatglm4-9b-1m model, running batched inference over files randomly crashes with the following error:

GGML_ASSERT: /home/lili/chatglm.cpp/third_party/ggml/src/ggml.c:3596: view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)
Aborted (core dumped)

@leizhu1989
Author

It might be caused by running out of GPU memory.

@leizhu1989
Author

No, that's not it. It happens randomly, so it looks like a real bug.

@leizhu1989
Author

Output right before the crash:

6
.
题型
多样化
多样化的
练习
能够
帮助
巩固
GGML_ASSERT: /home/lili/chatglm.cpp/third_party/ggml/src/ggml.c:3596: view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)
Aborted (core dumped)

It looks like the bug shows up right as generation finishes. I ran the original openai_api code:

import asyncio
import base64
import io
import json
import logging
import time
from typing import Dict, List, Literal, Optional, Union

import chatglm_cpp
import uvicorn
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, computed_field
from pydantic_settings import BaseSettings
from sse_starlette.sse import EventSourceResponse

logging.basicConfig(level=logging.INFO, format=r"%(asctime)s - %(module)s - %(levelname)s - %(message)s")

class Settings(BaseSettings):
    model: str = "/home/lili/models/chatglm-ggml-int8-9b-1m.bin"
    max_length: int = 4096

class ToolCallFunction(BaseModel):
    arguments: str
    name: str

class ToolCall(BaseModel):
    function: Optional[ToolCallFunction] = None
    type: Literal["function"]

class ContentText(BaseModel):
    type: Literal["text"] = "text"
    text: str

class ContentImageUrlData(BaseModel):
    url: str
    detail: str = "high"

class ContentImageUrl(BaseModel):
    type: Literal["image_url"] = "image_url"
    image_url: ContentImageUrlData

class ChatMessage(BaseModel):
    role: Literal["system", "user", "assistant"]
    content: Union[str, List[Union[ContentText, ContentImageUrl]]]
    tool_calls: Optional[List[ToolCall]] = None

class DeltaMessage(BaseModel):
    role: Optional[Literal["system", "user", "assistant"]] = None
    content: Optional[str] = None
    tool_calls: Optional[List[ToolCall]] = None

class ChatCompletionToolFunction(BaseModel):
    description: Optional[str] = None
    name: str
    parameters: Dict

class ChatCompletionTool(BaseModel):
    type: Literal["function"] = "function"
    function: ChatCompletionToolFunction

class ChatCompletionRequest(BaseModel):
    model: str = "default-model"
    messages: List[ChatMessage]
    temperature: float = Field(default=0.95, ge=0.0, le=2.0)
    top_p: float = Field(default=0.7, ge=0.0, le=1.0)
    top_k: int = Field(default=2, ge=3)
    stream: bool = False
    max_tokens: int = Field(default=2048, ge=0)
    tools: Optional[List[ChatCompletionTool]] = None
    repeat_penalty: float = Field(default=1.0, ge=0.0, le=2.0)

    model_config = {
        "json_schema_extra": {
            "examples": [{"model": "default-model", "messages": [{"role": "user", "content": "你好"}]}]
        }
    }

class ChatCompletionResponseChoice(BaseModel):
    index: int = 0
    message: ChatMessage
    finish_reason: Literal["stop", "length", "function_call"]

class ChatCompletionResponseStreamChoice(BaseModel):
    index: int = 0
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]] = None

class ChatCompletionUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int

    @computed_field
    @property
    def total_tokens(self) -> int:
        return self.prompt_tokens + self.completion_tokens

class ChatCompletionResponse(BaseModel):
    id: str = "chatcmpl"
    model: str = "default-model"
    object: Literal["chat.completion", "chat.completion.chunk"]
    created: int = Field(default_factory=lambda: int(time.time()))
    choices: Union[List[ChatCompletionResponseChoice], List[ChatCompletionResponseStreamChoice]]
    usage: Optional[ChatCompletionUsage] = None

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "id": "chatcmpl",
                    "model": "default-model",
                    "object": "chat.completion",
                    "created": 1691166146,
                    "choices": [
                        {
                            "index": 0,
                            "message": {
                                "role": "assistant",
                                "content": "你好👋！我是人工智能助手 ChatGLM2-6B，很高兴见到你，欢迎问我任何问题。",
                            },
                            "finish_reason": "stop",
                        }
                    ],
                    "usage": {"prompt_tokens": 17, "completion_tokens": 29, "total_tokens": 46},
                }
            ]
        }
    }

settings = Settings()
app = FastAPI()
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)
pipeline = chatglm_cpp.Pipeline(settings.model, max_length=settings.max_length)
lock = asyncio.Lock()

def stream_chat(messages, body):
    yield ChatCompletionResponse(
        object="chat.completion.chunk",
        choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(role="assistant"))],
    )

    for chunk in pipeline.chat(
        messages=messages,
        max_length=4500,
        max_new_tokens=2000,
        max_context_length=2500,
        do_sample=0.95,
        top_p=body.top_p,
        top_k=body.top_k,
        temperature=body.temperature,
        repetition_penalty=body.repeat_penalty,
        stream=True,
    ):
        yield ChatCompletionResponse(
            object="chat.completion.chunk",
            choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(content=chunk.content))],
        )

    yield ChatCompletionResponse(
        object="chat.completion.chunk",
        choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(), finish_reason="stop")],
    )

async def stream_chat_event_publisher(history, body):
    output = ""
    try:
        async with lock:
            for chunk in stream_chat(history, body):
                await asyncio.sleep(0)  # yield control back to event loop for cancellation check
                output += chunk.choices[0].delta.content or ""
                print(chunk.choices[0].delta.content)
                yield chunk.model_dump_json(exclude_unset=True)
        logging.info(f'prompt: "{history[-1]}", stream response: "{output}"')
    except asyncio.CancelledError as e:
        logging.info(f'prompt: "{history[-1]}", stream response (partial): "{output}"')
        raise e

@app.post("/v1/chat/completions")
async def create_chat_completion(body: ChatCompletionRequest) -> ChatCompletionResponse:
    def to_json_arguments(arguments):
        def tool_call(**kwargs):
            return kwargs

        try:
            return json.dumps(eval(arguments, dict(tool_call=tool_call)))
        except Exception:
            return arguments

    if not body.messages:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "empty messages")

    messages = []
    for msg in body.messages:
        if isinstance(msg.content, str):
            msg.content = msg.content[:3000]
            messages.append(chatglm_cpp.ChatMessage(role=msg.role, content=msg.content))
        # else:
        #     if not (len(msg.content) == 2 and msg.content[0].type == "text" and msg.content[1].type == "image_url"):
        #         raise HTTPException(
        #             status.HTTP_400_BAD_REQUEST,
        #             "multimodal content must have a text item followed by an image_url item",
        #         )

        #     import numpy as np
        #     from PIL import Image

        #     text = msg.content[0].text
        #     image_url = msg.content[1].image_url.url
        #     if image_url.startswith("data:"):
        #         image_bytes = base64.b64decode(image_url.split(",")[1])
        #     else:
        #         import requests

        #         image_bytes = requests.get(image_url).content
        #     image = chatglm_cpp.Image(np.asarray(Image.open(io.BytesIO(image_bytes))))

        #     messages.append(chatglm_cpp.ChatMessage(role=msg.role, content=text, image=image))

    if body.tools:
        system_content = (
            "Answer the following questions as best as you can. You have access to the following tools:\n"
            + json.dumps([tool.model_dump() for tool in body.tools], indent=4)
        )
        messages.insert(0, chatglm_cpp.ChatMessage(role="system", content=system_content))

    if body.stream:
        generator = stream_chat_event_publisher(messages, body)
        return EventSourceResponse(generator)

    max_context_length = 2500
    output = pipeline.chat(
        messages=messages,
        max_length=4500,
        max_new_tokens=2000,
        max_context_length=2500,
        do_sample=0.95,
        top_p=body.top_p,
        top_k=body.top_k,
        temperature=body.temperature,
        repetition_penalty=body.repeat_penalty,
    )
    logging.info(f'prompt: "{messages[-1].content}", sync response: "{output.content}"')
    prompt_tokens = len(pipeline.tokenizer.apply_chat_template(messages, max_context_length))
    completion_tokens = len(pipeline.tokenizer.encode(output.content, body.max_tokens))

    finish_reason = "stop"
    tool_calls = None
    if output.tool_calls:
        tool_calls = [
            ToolCall(
                type=tool_call.type,
                function=ToolCallFunction(
                    name=tool_call.function.name, arguments=to_json_arguments(tool_call.function.arguments)
                ),
            )
            for tool_call in output.tool_calls
        ]
        finish_reason = "function_call"

    return ChatCompletionResponse(
        object="chat.completion",
        choices=[
            ChatCompletionResponseChoice(
                message=ChatMessage(role="assistant", content=output.content, tool_calls=tool_calls),
                finish_reason=finish_reason,
            )
        ],
        usage=ChatCompletionUsage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens),
    )

class ModelCard(BaseModel):
    id: str
    object: Literal["model"] = "model"
    owned_by: str = "owner"
    permission: List = []

class ModelList(BaseModel):
    object: Literal["list"] = "list"
    data: List[ModelCard] = []

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "object": "list",
                    "data": [{"id": "gpt-3.5-turbo", "object": "model", "owned_by": "owner", "permission": []}],
                }
            ]
        }
    }

@app.get("/v1/models")
async def list_models() -> ModelList:
    return ModelList(data=[ModelCard(id="gpt-3.5-turbo")])

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
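
For context, the server was exercised with requests along these lines. This is only a sketch: the endpoint and port come from the code above, while the prompt text and its length are placeholders rather than the actual batched file contents that triggered the crash.

# Hypothetical client sketch (not the actual batch script from this issue):
# send one long document to the server above and print the reply.
import requests

long_text = "……" * 2000  # placeholder for a long document read from a file

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "default-model",
        "messages": [{"role": "user", "content": long_text}],
        "stream": False,
    },
    timeout=600,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])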

@li-plus
Owner

li-plus commented Jul 31, 2024

This is probably a bug. Could you provide a minimal example that reproduces it?
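
A minimal reproduction might take roughly this shape. This is only a sketch under assumptions: it reuses the model path and generation settings from the code above, but the long prompt is filler text, not the real input that caused the assertion.

# Hypothetical minimal repro sketch: drive the pipeline directly with a long
# prompt and the same generation settings as the API server above.
import chatglm_cpp

pipeline = chatglm_cpp.Pipeline(
    "/home/lili/models/chatglm-ggml-int8-9b-1m.bin", max_length=4096
)

# Placeholder long input; the real trigger inputs are not attached to this issue.
long_text = "题型多样化的练习能够帮助巩固。" * 300

messages = [chatglm_cpp.ChatMessage(role="user", content=long_text[:3000])]
for chunk in pipeline.chat(
    messages,
    max_length=4500,
    max_new_tokens=2000,
    max_context_length=2500,
    do_sample=True,
    stream=True,
):
    print(chunk.content, end="", flush=True)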
