feat: Support vLLM as a model backend (#734)
zechengz authored Jul 19, 2024
1 parent 6bcdb31 commit bc54301
Showing 10 changed files with 397 additions and 5 deletions.
35 changes: 35 additions & 0 deletions README.md
@@ -236,6 +236,41 @@ Please note that the environment variable is session-specific. If you open a new
print(assistant_response.msg.content)
```

## Use Open-Source Models as Backends (e.g., using vLLM to serve Phi-3 locally)
- [Install vLLM](https://docs.vllm.ai/en/latest/getting_started/installation.html)
- After setting up vLLM, start an OpenAI-compatible server, for example (see the reachability check sketched after this list):
```bash
python -m vllm.entrypoints.openai.api_server --model microsoft/Phi-3-mini-4k-instruct --api-key vllm --dtype bfloat16
```
- Create and run the following script (for more details, please refer to this [example](https://github.com/camel-ai/camel/blob/master/examples/models/vllm_model_example.py))
```python
from camel.agents import ChatAgent
from camel.messages import BaseMessage
from camel.models import ModelFactory
from camel.types import ModelPlatformType

vllm_model = ModelFactory.create(
model_platform=ModelPlatformType.VLLM,
model_type="microsoft/Phi-3-mini-4k-instruct",
url="http://localhost:8000/v1",
model_config_dict={"temperature": 0.0},
api_key="vllm",
)
assistant_sys_msg = BaseMessage.make_assistant_message(
role_name="Assistant",
content="You are a helpful assistant.",
)
agent = ChatAgent(assistant_sys_msg, model=vllm_model, token_limit=4096)
user_msg = BaseMessage.make_user_message(
role_name="User",
content="Say hi to CAMEL AI",
)
assistant_response = agent.step(user_msg)
print(assistant_response.msg.content)
```
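
A minimal sketch to verify the server is reachable before running the script, assuming the URL and API key from the launch command above:
```python
from openai import OpenAI

# Assumes the server was started with `--api-key vllm` on the default port 8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="vllm")
print(client.models.list())  # should list microsoft/Phi-3-mini-4k-instruct
```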

## Data (Hosted on Hugging Face)
| Dataset | Chat format | Instruction format | Chat format (translated) |
|----------------|-----------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|
3 changes: 3 additions & 0 deletions camel/configs/__init__.py
@@ -24,6 +24,7 @@
ChatGPTConfig,
OpenSourceConfig,
)
from .vllm_config import VLLM_API_PARAMS, VLLMConfig
from .zhipuai_config import ZHIPUAI_API_PARAMS, ZhipuAIConfig

__all__ = [
@@ -41,4 +42,6 @@
'ZHIPUAI_API_PARAMS',
'GeminiConfig',
'Gemini_API_PARAMS',
'VLLMConfig',
'VLLM_API_PARAMS',
]
103 changes: 103 additions & 0 deletions camel/configs/vllm_config.py
@@ -0,0 +1,103 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
from __future__ import annotations

from dataclasses import asdict, dataclass, field
from typing import Sequence

from openai._types import NOT_GIVEN, NotGiven

from camel.configs.base_config import BaseConfig


# flake8: noqa: E501
@dataclass(frozen=True)
class VLLMConfig(BaseConfig):
r"""Defines the parameters for generating chat completions using the
OpenAI API.
Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
Args:
temperature (float, optional): Sampling temperature to use, between
:obj:`0` and :obj:`2`. Higher values make the output more random,
while lower values make it more focused and deterministic.
(default: :obj:`0.2`)
top_p (float, optional): An alternative to sampling with temperature,
called nucleus sampling, where the model considers the results of
the tokens with top_p probability mass. So :obj:`0.1` means only
the tokens comprising the top 10% probability mass are considered.
(default: :obj:`1.0`)
n (int, optional): How many chat completion choices to generate for
each input message. (default: :obj:`1`)
response_format (object, optional): An object specifying the format
that the model must output. Compatible with GPT-4 Turbo and all
GPT-3.5 Turbo models newer than gpt-3.5-turbo-1106. Setting to
{"type": "json_object"} enables JSON mode, which guarantees the
message the model generates is valid JSON. Important: when using
JSON mode, you must also instruct the model to produce JSON
yourself via a system or user message. Without this, the model
may generate an unending stream of whitespace until the generation
reaches the token limit, resulting in a long-running and seemingly
"stuck" request. Also note that the message content may be
partially cut off if finish_reason="length", which indicates the
generation exceeded max_tokens or the conversation exceeded the
max context length.
stream (bool, optional): If True, partial message deltas will be sent
as data-only server-sent events as they become available.
(default: :obj:`False`)
stop (str or list, optional): Up to :obj:`4` sequences where the API
will stop generating further tokens. (default: :obj:`None`)
max_tokens (int, optional): The maximum number of tokens to generate
in the chat completion. The total length of input tokens and
generated tokens is limited by the model's context length.
(default: :obj:`None`)
presence_penalty (float, optional): Number between :obj:`-2.0` and
:obj:`2.0`. Positive values penalize new tokens based on whether
they appear in the text so far, increasing the model's likelihood
to talk about new topics. See more information about frequency and
presence penalties. (default: :obj:`0.0`)
frequency_penalty (float, optional): Number between :obj:`-2.0` and
:obj:`2.0`. Positive values penalize new tokens based on their
existing frequency in the text so far, decreasing the model's
likelihood to repeat the same line verbatim. See more information
about frequency and presence penalties. (default: :obj:`0.0`)
logit_bias (dict, optional): Modify the likelihood of specified tokens
appearing in the completion. Accepts a json object that maps tokens
(specified by their token ID in the tokenizer) to an associated
bias value from :obj:`-100` to :obj:`100`. Mathematically, the bias
is added to the logits generated by the model prior to sampling.
            The exact effect will vary per model, but values between :obj:`-1`
            and :obj:`1` should decrease or increase likelihood of selection;
values like :obj:`-100` or :obj:`100` should result in a ban or
exclusive selection of the relevant token. (default: :obj:`{}`)
user (str, optional): A unique identifier representing your end-user,
which can help OpenAI to monitor and detect abuse.
(default: :obj:`""`)
"""

temperature: float = 0.2 # openai default: 1.0
top_p: float = 1.0
n: int = 1
stream: bool = False
stop: str | Sequence[str] | NotGiven = NOT_GIVEN
max_tokens: int | NotGiven = NOT_GIVEN
presence_penalty: float = 0.0
response_format: dict | NotGiven = NOT_GIVEN
frequency_penalty: float = 0.0
logit_bias: dict = field(default_factory=dict)
user: str = ""


VLLM_API_PARAMS = {param for param in asdict(VLLMConfig()).keys()}
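
For illustration, a minimal sketch (with illustrative values) of the invariant this set encodes: every `VLLMConfig` field name is an accepted vLLM parameter, which the backend's config check below relies on.
```python
from dataclasses import asdict

from camel.configs import VLLM_API_PARAMS, VLLMConfig

# Every VLLMConfig field name appears in VLLM_API_PARAMS by construction.
config_dict = asdict(VLLMConfig(temperature=0.0, n=1))
assert set(config_dict).issubset(VLLM_API_PARAMS)
```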
2 changes: 2 additions & 0 deletions camel/models/__init__.py
@@ -22,6 +22,7 @@
from .openai_audio_models import OpenAIAudioModels
from .openai_model import OpenAIModel
from .stub_model import StubModel
from .vllm_model import VLLMModel
from .zhipuai_model import ZhipuAIModel

__all__ = [
@@ -36,5 +37,6 @@
'OpenAIAudioModels',
'NemotronModel',
'OllamaModel',
'VLLMModel',
'GeminiModel',
]
4 changes: 4 additions & 0 deletions camel/models/model_factory.py
@@ -21,6 +21,7 @@
from camel.models.open_source_model import OpenSourceModel
from camel.models.openai_model import OpenAIModel
from camel.models.stub_model import StubModel
from camel.models.vllm_model import VLLMModel
from camel.models.zhipuai_model import ZhipuAIModel
from camel.types import ModelPlatformType, ModelType

@@ -83,6 +84,9 @@ def create(
if model_platform.is_ollama:
model_class = OllamaModel
return model_class(model_type, model_config_dict, url)
elif model_platform.is_vllm:
model_class = VLLMModel
return model_class(model_type, model_config_dict, url, api_key)
elif model_platform.is_litellm:
model_class = LiteLLMModel
else:
135 changes: 135 additions & 0 deletions camel/models/vllm_model.py
@@ -0,0 +1,135 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
from typing import Any, Dict, List, Optional, Union

from openai import OpenAI, Stream

from camel.configs import VLLM_API_PARAMS
from camel.messages import OpenAIMessage
from camel.types import ChatCompletion, ChatCompletionChunk, ModelType
from camel.utils import BaseTokenCounter, OpenAITokenCounter


# flake8: noqa: E501
class VLLMModel:
r"""vLLM service interface."""

def __init__(
self,
model_type: str,
model_config_dict: Dict[str, Any],
url: Optional[str] = None,
api_key: Optional[str] = None,
) -> None:
r"""Constructor for vLLM backend with OpenAI compatibility.
# Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
Args:
model_type (str): Model for which a backend is created.
            model_config_dict (Dict[str, Any]): A dictionary that will
                be fed into `client.chat.completions.create()`.
url (Optional[str]): The url to the model service. (default:
:obj:`None`)
api_key (Optional[str]): The API key for authenticating with the
model service.
"""
self.model_type = model_type
self.model_config_dict = model_config_dict
        # Use the OpenAI client as the interface to call vLLM
self._client = OpenAI(
timeout=60,
max_retries=3,
base_url=url,
api_key=api_key,
)
self._token_counter: Optional[BaseTokenCounter] = None
self.check_model_config()

@property
def token_counter(self) -> BaseTokenCounter:
r"""Initialize the token counter for the model backend.
Returns:
BaseTokenCounter: The token counter following the model's
tokenization style.
"""
# NOTE: Use OpenAITokenCounter temporarily
if not self._token_counter:
self._token_counter = OpenAITokenCounter(ModelType.GPT_3_5_TURBO)
return self._token_counter

def check_model_config(self):
r"""Check whether the model configuration contains any
unexpected arguments to vLLM API.
Raises:
ValueError: If the model configuration dictionary contains any
unexpected arguments to OpenAI API.
"""
for param in self.model_config_dict:
if param not in VLLM_API_PARAMS:
raise ValueError(
f"Unexpected argument `{param}` is "
"input into vLLM model backend."
)

def run(
self,
messages: List[OpenAIMessage],
) -> Union[ChatCompletion, Stream[ChatCompletionChunk]]:
r"""Runs inference of OpenAI chat completion.
Args:
messages (List[OpenAIMessage]): Message list with the chat history
in OpenAI API format.
Returns:
Union[ChatCompletion, Stream[ChatCompletionChunk]]:
`ChatCompletion` in the non-stream mode, or
`Stream[ChatCompletionChunk]` in the stream mode.
"""

response = self._client.chat.completions.create(
messages=messages,
model=self.model_type,
**self.model_config_dict,
)
return response

@property
def token_limit(self) -> int:
"""Returns the maximum token limit for the given model.
Returns:
int: The maximum token limit for the given model.
"""
max_tokens = self.model_config_dict.get("max_tokens")
if isinstance(max_tokens, int):
return max_tokens
print(
"Must set `max_tokens` as an integer in `model_config_dict` when"
" setting up the model. Using 4096 as default value."
)
return 4096

@property
def stream(self) -> bool:
r"""Returns whether the model is in stream mode, which sends partial
        results each time.

        Returns:
bool: Whether the model is in stream mode.
"""
return self.model_config_dict.get('stream', False)
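
A minimal sketch of driving this backend directly, assuming a local vLLM server as in the README section above; in practice the model is usually constructed through `ModelFactory`, as in the example file below.
```python
from camel.models.vllm_model import VLLMModel

# Assumes a vLLM server is running locally with `--api-key vllm`.
model = VLLMModel(
    model_type="microsoft/Phi-3-mini-4k-instruct",
    model_config_dict={"temperature": 0.0, "max_tokens": 512},
    url="http://localhost:8000/v1",
    api_key="vllm",
)
response = model.run([{"role": "user", "content": "Say hi to CAMEL AI"}])
print(response.choices[0].message.content)  # non-stream mode returns ChatCompletion
```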
6 changes: 6 additions & 0 deletions camel/types/enums.py
@@ -344,6 +344,7 @@ class ModelPlatformType(Enum):
ZHIPU = "zhipuai"
DEFAULT = "default"
GEMINI = "gemini"
VLLM = "vllm"

@property
def is_openai(self) -> bool:
@@ -365,6 +366,11 @@ def is_ollama(self) -> bool:
r"""Returns whether this platform is ollama."""
return self is ModelPlatformType.OLLAMA

@property
def is_vllm(self) -> bool:
r"""Returns whether this platform is vllm."""
return self is ModelPlatformType.VLLM

@property
def is_litellm(self) -> bool:
r"""Returns whether this platform is litellm."""
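
A small sketch of the new platform value and its predicate (illustrative):
```python
from camel.types import ModelPlatformType

platform = ModelPlatformType("vllm")  # constructed from its string value
assert platform is ModelPlatformType.VLLM
assert platform.is_vllm
```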
45 changes: 45 additions & 0 deletions examples/models/vllm_model_example.py
@@ -0,0 +1,45 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========

from camel.agents import ChatAgent
from camel.messages import BaseMessage
from camel.models import ModelFactory
from camel.types import ModelPlatformType

vllm_model = ModelFactory.create(
model_platform=ModelPlatformType.VLLM,
model_type="microsoft/Phi-3-mini-4k-instruct",
url="http://localhost:8000/v1",
model_config_dict={"temperature": 0.0},
api_key="vllm",
)

assistant_sys_msg = BaseMessage.make_assistant_message(
role_name="Assistant",
content="You are a helpful assistant.",
)
agent = ChatAgent(assistant_sys_msg, model=vllm_model, token_limit=4096)

user_msg = BaseMessage.make_user_message(
role_name="User",
content="Say hi to CAMEL AI",
)
assistant_response = agent.step(user_msg)
print(assistant_response.msg.content)

"""
===============================================================================
Hello! I'm Phi, an AI developed by Microsoft. How can I help you today?
===============================================================================
"""