From beb16b2c810a87b28e7b8a7aa29d26f842f654b9 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Tue, 10 Dec 2024 03:27:11 -0700 Subject: [PATCH 001/357] [Bugfix] Handle <|tool_call|> token in granite tool parser (#11039) Signed-off-by: Travis Johnson --- vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index b5854ca39ab47..00917c866e496 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -35,11 +35,13 @@ class GraniteToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) + self.bot_token = "<|tool_call|>" def extract_tool_calls( self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: - stripped = model_output.strip() + # remove whitespace and the BOT token if it exists + stripped = model_output.strip().removeprefix(self.bot_token).lstrip() if not stripped or stripped[0] != '[': return ExtractedToolCallInformation(tools_called=False, tool_calls=[], From d05f88679bedd73939251a17c3d785a354b2946c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 10 Dec 2024 19:12:01 +0800 Subject: [PATCH 002/357] [Misc][LoRA] Add PEFTHelper for LoRA (#11003) Signed-off-by: Jee Jee Li --- tests/lora/test_lora_manager.py | 58 +++++++++++++++++++++++++-- vllm/lora/lora.py | 18 +++++++++ vllm/lora/models.py | 42 ++++++++------------ vllm/lora/peft_helper.py | 70 +++++++++++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 28 deletions(-) create mode 100644 vllm/lora/peft_helper.py diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 8d109b2c81503..0b76f466702fc 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,3 +1,4 @@ +import json import os from typing import Dict, List @@ -13,6 +14,7 @@ from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager) +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) @@ -30,18 +32,68 @@ ] +def test_peft_helper(sql_lora_files): + lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") + with open(lora_config_path) as f: + config = json.load(f) + peft_helper = PEFTHelper.from_dict(config) + assert peft_helper.r == 8 + assert peft_helper.lora_alpha == 16 + assert peft_helper.target_modules == [ + "q_proj", + "v_proj", + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + + expected_error = "vLLM only supports modules_to_save being None." + with pytest.raises(ValueError, match=expected_error): + config = dict( + r=8, + lora_alpha=16, + target_modules=["gate_proj"], + modules_to_save=["lm_head"], + ) + PEFTHelper.from_dict(config) + expected_error = "vLLM does not yet support RSLoRA." + with pytest.raises(ValueError, match=expected_error): + config = dict(r=8, + lora_alpha=16, + target_modules=["gate_proj"], + use_rslora=True) + PEFTHelper.from_dict(config) + + expected_error = "vLLM does not yet support DoRA." 
+ with pytest.raises(ValueError, match=expected_error): + config = dict(r=8, + lora_alpha=16, + target_modules=["gate_proj"], + use_dora=True) + PEFTHelper.from_dict(config) + + @pytest.mark.parametrize("device", CUDA_DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( os.path.join(sql_lora_files, "new_embeddings.safetensors")) + + lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") + with open(lora_config_path) as f: + config = json.load(f) + + peft_helper = PEFTHelper.from_dict(config) lora_model = LoRAModel.from_lora_tensors( 1, - 8, - 16, tensors, - device, + peft_helper=peft_helper, + device=device, embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES) diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index b648312ba76ec..dde347b78bf81 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -4,6 +4,7 @@ import torch import torch.types +from vllm.lora.peft_helper import PEFTHelper from vllm.utils import is_pin_memory_available @@ -59,6 +60,23 @@ def extra_vocab_size(self) -> int: return self.embeddings_tensor.shape[ 0] if self.embeddings_tensor is not None else 0 + @classmethod + def from_config( + cls, + module_name: str, + peft_helper: PEFTHelper, + embeddings_tensor: Optional[torch.Tensor] = None, + ) -> "LoRALayerWeights": + return cls( + module_name, + peft_helper.r, + peft_helper.lora_alpha, + None, + None, + None, + embeddings_tensor, + ) + @classmethod def create_dummy_lora_weights( cls, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 49cd9f0c236ad..70806a77b9fff 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -21,6 +21,7 @@ LinearScalingRotaryEmbeddingWithLora, LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, is_regex_target_modules, @@ -104,14 +105,12 @@ def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]: def from_lora_tensors( cls, lora_model_id: int, - rank: int, - lora_alpha: int, tensors: Dict[str, torch.Tensor], + peft_helper: PEFTHelper, device: str = "cuda", dtype: Optional[torch.dtype] = None, embeddings: Optional[Dict[str, torch.Tensor]] = None, target_embedding_padding: Optional[int] = None, - scaling_factor: Optional[float] = None, embedding_modules: Optional[Dict[str, str]] = None, embedding_padding_modules: Optional[List[str]] = None, ) -> "LoRAModel": @@ -135,10 +134,9 @@ def from_lora_tensors( if pin_memory: lora_embeddings_tensor = ( lora_embeddings_tensor.pin_memory()) - loras[module_name] = LoRALayerWeights(module_name, rank, - lora_alpha, None, None, - None, - lora_embeddings_tensor) + loras[module_name] = LoRALayerWeights.from_config( + module_name, peft_helper, lora_embeddings_tensor) + if is_bias: loras[module_name].bias = tensor.to(device=device, dtype=dtype).t() @@ -170,7 +168,11 @@ def from_lora_tensors( for lora in loras.values(): lora.optimize() - return cls(lora_model_id, rank, loras, scaling_factor=scaling_factor) + + return cls(lora_model_id, + peft_helper.r, + loras, + scaling_factor=peft_helper.vllm_scaling_factor) @classmethod def from_local_checkpoint( @@ -212,6 +214,9 @@ def from_local_checkpoint( "new_embeddings.bin") with open(lora_config_path) as f: config = json.load(f) + + 
config["vllm_max_position_embeddings"] = max_position_embeddings + peft_helper = PEFTHelper.from_dict(config) if os.path.isfile(lora_tensor_path): tensors: Dict[str, torch.Tensor] = {} # Find unexpected modules. @@ -242,7 +247,7 @@ def from_local_checkpoint( # When a bin file is provided, we rely on config to find unexpected # modules. unexpected_modules = [] - target_modules = config["target_modules"] + target_modules = peft_helper.target_modules if not isinstance(target_modules, list): target_modules = [target_modules] for module in target_modules: @@ -256,7 +261,7 @@ def from_local_checkpoint( # https://github.com/vllm-project/vllm/pull/5909. But there's no # other better mechanism. if unexpected_modules and not is_regex_target_modules( - config["target_modules"], expected_lora_modules): + peft_helper.target_modules, expected_lora_modules): raise ValueError( f"While loading {lora_dir}, expected" f" target modules in {expected_lora_modules}" @@ -274,30 +279,17 @@ def from_local_checkpoint( embeddings = torch.load(new_embeddings_bin_file_path, map_location=device) - rank = config["r"] - lora_alpha = config["lora_alpha"] - context_length = config.get("context_length", None) - scaling_factor = None - if context_length: - if max_position_embeddings is None: - max_position_embeddings = context_length - scaling_factor = float( - math.ceil(context_length / max_position_embeddings)) - return cls.from_lora_tensors( lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id, - rank=rank, - lora_alpha=lora_alpha, tensors=tensors, + peft_helper=peft_helper, device=device, dtype=dtype, embeddings=embeddings, target_embedding_padding=target_embedding_padding, - scaling_factor=scaling_factor, embedding_modules=embedding_modules, - embedding_padding_modules=embedding_padding_modules, - ) + embedding_padding_modules=embedding_padding_modules) class LoRAModelManager(AdapterModelManager): diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py new file mode 100644 index 0000000000000..edf4ba5659575 --- /dev/null +++ b/vllm/lora/peft_helper.py @@ -0,0 +1,70 @@ +# Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py + +import math +from dataclasses import MISSING, dataclass, field, fields +from typing import Literal, Optional, Union + + +@dataclass +class PEFTHelper: + # Required fields + r: int + lora_alpha: int + target_modules: Union[list[str], str] + + bias: Literal["none", "all", "lora_only"] = field(default="none") + modules_to_save: Optional[list[str]] = field(default=None) + use_rslora: bool = field(default=False) + use_dora: bool = field(default=False) + # long lora field + context_length: int = field(default=0) + # Extra vllm field, start with 'vllm_' to avoid conflict + vllm_max_position_embeddings: Optional[int] = field(default=False) + vllm_scaling_factor: Optional[float] = field(default=None) + + def _validate_features(self): + error_msg = [] + + if self.modules_to_save: + error_msg.append("vLLM only supports modules_to_save being None.") + if self.use_rslora: + error_msg.append("vLLM does not yet support RSLoRA.") + + if self.use_dora: + error_msg.append("vLLM does not yet support DoRA.") + + if error_msg: + raise ValueError(f"{', '.join(error_msg)}") + + def __post_init__(self): + self._validate_features() + if self.context_length: + if self.vllm_max_position_embeddings is None: + self.vllm_max_position_embeddings = self.context_length + self.vllm_scaling_factor = float( + math.ceil(self.context_length / + 
self.vllm_max_position_embeddings)) + + @classmethod + def from_dict(cls, config_dict: dict) -> "PEFTHelper": + # Get all field information from the class + class_fields = {f.name: f for f in fields(cls)} + # Check for required fields + required_fields = { + name + for name, f in class_fields.items() + if f.default is MISSING and f.default_factory is MISSING + } + + # Identify any missing required fields + missing_fields = required_fields - set(config_dict.keys()) + if missing_fields: + raise ValueError( + f"Missing required configuration fields: {missing_fields}") + + # Filter out fields that aren't defined in the class + filtered_dict = { + k: v + for k, v in config_dict.items() if k in class_fields + } + return cls(**filtered_dict) From 9b9cef3145381721fa950c89718fe71849ac2a55 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 10 Dec 2024 09:38:23 -0700 Subject: [PATCH 003/357] [Bugfix] Backport request id validation to v0 (#11036) Signed-off-by: Joe Runde --- vllm/engine/multiprocessing/client.py | 4 ++++ vllm/v1/engine/async_llm.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 32bd83305bb8f..a729023bc00bb 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -576,6 +576,10 @@ async def _process_request( if self._errored_with is not None: raise ENGINE_DEAD_ERROR(self._errored_with) + # Ensure the request id is unique among running requests + if request_id in self.output_queues: + raise ValueError(f"Request {request_id} already exists") + # Constructing guided decoding logits processors is expensive, so we do # it here to avoid contending with cpu resources and the GIL on the # backend process. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 26fd650aee4b7..24cafeff63d1e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -152,7 +152,7 @@ async def add_request( """Add new request to the AsyncLLM.""" if self.detokenizer.is_request_active(request_id): - raise KeyError(f"Request {request_id} already exists.") + raise ValueError(f"Request {request_id} already exists.") # 1) Create a new AsyncStream for the request. 
stream = self._add_request_to_streams(request_id) From 250ee65d72a0c7b86ec5cea9cbe9377da21d6439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1via=20B=C3=A9o?= <119421251+flaviabeo@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:38:15 -0300 Subject: [PATCH 004/357] [BUG] Remove token param #10921 (#11022) Signed-off-by: Flavia Beo --- vllm/transformers_utils/config.py | 63 ++++++++++++++----------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3da99bcbee9ae..4529cf27ef565 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,5 +1,6 @@ import enum import json +import os from pathlib import Path from typing import Any, Dict, Optional, Type, Union @@ -41,6 +42,7 @@ from transformers import AutoConfig MISTRAL_CONFIG_NAME = "params.json" +HF_TOKEN = os.getenv('HF_TOKEN', None) logger = init_logger(__name__) @@ -77,8 +79,8 @@ class ConfigFormat(str, enum.Enum): MISTRAL = "mistral" -def file_or_path_exists(model: Union[str, Path], config_name, revision, - token) -> bool: +def file_or_path_exists(model: Union[str, Path], config_name: str, + revision: Optional[str]) -> bool: if Path(model).exists(): return (Path(model) / config_name).is_file() @@ -93,7 +95,10 @@ def file_or_path_exists(model: Union[str, Path], config_name, revision, # NB: file_exists will only check for the existence of the config file on # hf_hub. This will fail in offline mode. try: - return file_exists(model, config_name, revision=revision, token=token) + return file_exists(model, + config_name, + revision=revision, + token=HF_TOKEN) except huggingface_hub.errors.OfflineModeIsEnabled: # Don't raise in offline mode, all we know is that we don't have this # file cached. @@ -161,7 +166,6 @@ def get_config( revision: Optional[str] = None, code_revision: Optional[str] = None, config_format: ConfigFormat = ConfigFormat.AUTO, - token: Optional[str] = None, **kwargs, ) -> PretrainedConfig: # Separate model folder from file path for GGUF models @@ -173,19 +177,20 @@ def get_config( if config_format == ConfigFormat.AUTO: if is_gguf or file_or_path_exists( - model, HF_CONFIG_NAME, revision=revision, token=token): + model, HF_CONFIG_NAME, revision=revision): config_format = ConfigFormat.HF - elif file_or_path_exists(model, - MISTRAL_CONFIG_NAME, - revision=revision, - token=token): + elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, + revision=revision): config_format = ConfigFormat.MISTRAL else: # If we're in offline mode and found no valid config format, then # raise an offline mode error to indicate to the user that they # don't have files cached and may need to go online. # This is conveniently triggered by calling file_exists(). 
- file_exists(model, HF_CONFIG_NAME, revision=revision, token=token) + file_exists(model, + HF_CONFIG_NAME, + revision=revision, + token=HF_TOKEN) raise ValueError(f"No supported config format found in {model}") @@ -194,7 +199,7 @@ def get_config( model, revision=revision, code_revision=code_revision, - token=token, + token=HF_TOKEN, **kwargs, ) @@ -206,7 +211,7 @@ def get_config( model, revision=revision, code_revision=code_revision, - token=token, + token=HF_TOKEN, **kwargs, ) else: @@ -216,7 +221,7 @@ def get_config( trust_remote_code=trust_remote_code, revision=revision, code_revision=code_revision, - token=token, + token=HF_TOKEN, **kwargs, ) except ValueError as e: @@ -234,7 +239,7 @@ def get_config( raise e elif config_format == ConfigFormat.MISTRAL: - config = load_params_config(model, revision, token=token, **kwargs) + config = load_params_config(model, revision, token=HF_TOKEN, **kwargs) else: raise ValueError(f"Unsupported config format: {config_format}") @@ -256,8 +261,7 @@ def get_config( def get_hf_file_to_dict(file_name: str, model: Union[str, Path], - revision: Optional[str] = 'main', - token: Optional[str] = None): + revision: Optional[str] = 'main'): """ Downloads a file from the Hugging Face Hub and returns its contents as a dictionary. @@ -266,7 +270,6 @@ def get_hf_file_to_dict(file_name: str, - file_name (str): The name of the file to download. - model (str): The name of the model on the Hugging Face Hub. - revision (str): The specific version of the model. - - token (str): The Hugging Face authentication token. Returns: - config_dict (dict): A dictionary containing @@ -276,8 +279,7 @@ def get_hf_file_to_dict(file_name: str, if file_or_path_exists(model=model, config_name=file_name, - revision=revision, - token=token): + revision=revision): if not file_path.is_file(): try: @@ -296,9 +298,7 @@ def get_hf_file_to_dict(file_name: str, return None -def get_pooling_config(model: str, - revision: Optional[str] = 'main', - token: Optional[str] = None): +def get_pooling_config(model: str, revision: Optional[str] = 'main'): """ This function gets the pooling and normalize config from the model - only applies to @@ -315,8 +315,7 @@ def get_pooling_config(model: str, """ modules_file_name = "modules.json" - modules_dict = get_hf_file_to_dict(modules_file_name, model, revision, - token) + modules_dict = get_hf_file_to_dict(modules_file_name, model, revision) if modules_dict is None: return None @@ -332,8 +331,7 @@ def get_pooling_config(model: str, if pooling: pooling_file_name = "{}/config.json".format(pooling["path"]) - pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision, - token) + pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision) pooling_type_name = next( (item for item, val in pooling_dict.items() if val is True), None) @@ -368,8 +366,8 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]: def get_sentence_transformer_tokenizer_config(model: str, - revision: Optional[str] = 'main', - token: Optional[str] = None): + revision: Optional[str] = 'main' + ): """ Returns the tokenization configuration dictionary for a given Sentence Transformer BERT model. @@ -379,7 +377,6 @@ def get_sentence_transformer_tokenizer_config(model: str, BERT model. - revision (str, optional): The revision of the m odel to use. Defaults to 'main'. - - token (str): A Hugging Face access token. 
Returns: - dict: A dictionary containing the configuration parameters @@ -394,7 +391,7 @@ def get_sentence_transformer_tokenizer_config(model: str, "sentence_xlm-roberta_config.json", "sentence_xlnet_config.json", ]: - encoder_dict = get_hf_file_to_dict(config_name, model, revision, token) + encoder_dict = get_hf_file_to_dict(config_name, model, revision) if encoder_dict: break @@ -474,16 +471,14 @@ def _reduce_config(config: VllmConfig): exc_info=e) -def load_params_config(model: Union[str, Path], - revision: Optional[str], - token: Optional[str] = None, +def load_params_config(model: Union[str, Path], revision: Optional[str], **kwargs) -> PretrainedConfig: # This function loads a params.json config which # should be used when loading models in mistral format config_file_name = "params.json" - config_dict = get_hf_file_to_dict(config_file_name, model, revision, token) + config_dict = get_hf_file_to_dict(config_file_name, model, revision) assert isinstance(config_dict, dict) config_mapping = { From e7391949267a4eff3d84f02119f442f46b16d163 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 10 Dec 2024 15:08:16 -0500 Subject: [PATCH 005/357] [Core] Update to outlines >= 0.1.8 (#10576) Signed-off-by: Russell Bryant --- requirements-common.txt | 2 +- .../guided_decoding/outlines_logits_processors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 112528880c0ac..c71fc458aca13 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -18,7 +18,7 @@ prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines >= 0.0.43, < 0.1 +outlines >= 0.1.8 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index e1309c31f77e7..1f0dbe024609d 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -99,7 +99,7 @@ class RegexLogitsProcessor(BaseLogitsProcessor): def _get_guide(cls, regex_string: str, tokenizer: PreTrainedTokenizerBase) -> Guide: tokenizer = _adapt_tokenizer(tokenizer) - return RegexGuide(regex_string, tokenizer) + return RegexGuide.from_regex(regex_string, tokenizer) def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase): """Compile the FSM that drives the regex-structured generation. 
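For readers following the outlines upgrade in the patch above: from outlines 0.1 onward, guides are built through a classmethod rather than the class constructor that older releases accepted. A minimal sketch of the new call pattern follows; it assumes outlines >= 0.1.8, that the import path matches the one already used by outlines_logits_processors.py, and that the tokenizer has been adapted for outlines beforehand (vLLM does this via _adapt_tokenizer before constructing the guide).

    # Sketch only, not part of the patches above. Assumes outlines >= 0.1.8 and a
    # tokenizer already wrapped for outlines (convert_token_to_string, special
    # token handling, etc.), as _adapt_tokenizer produces.
    from outlines.fsm.guide import RegexGuide

    def build_regex_guide(regex_string: str, adapted_tokenizer) -> RegexGuide:
        # outlines < 0.1 used the constructor:
        #     RegexGuide(regex_string, adapted_tokenizer)
        # outlines >= 0.1.8 builds the guide through a classmethod instead:
        return RegexGuide.from_regex(regex_string, adapted_tokenizer)
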
From 75f89dc44c6e44cc28bae59d5b40a588735b507b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 10 Dec 2024 12:40:52 -0800 Subject: [PATCH 006/357] [torch.compile] add a flag to track batchsize statistics (#11059) Signed-off-by: youkaichao --- vllm/envs.py | 3 +++ vllm/forward_context.py | 32 +++++++++++++++++++++++- vllm/v1/attention/backends/flash_attn.py | 1 + vllm/v1/worker/gpu_model_runner.py | 2 ++ 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index ab12a7b48dc53..be5d9985b63a4 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -69,6 +69,7 @@ VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False VLLM_ENABLE_V1_MULTIPROCESSING: bool = False + VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 def get_default_cache_root(): @@ -452,6 +453,8 @@ def get_default_config_root(): # If set, enable multiprocessing in LLM for the V1 code path. "VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0"))), + "VLLM_LOG_BATCHSIZE_INTERVAL": + lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), } # end-env-vars-definition diff --git a/vllm/forward_context.py b/vllm/forward_context.py index aaa3e4bb3a1e8..cd136f43c0c57 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -1,8 +1,19 @@ +import time +from collections import Counter from contextlib import contextmanager from dataclasses import dataclass from typing import Any, Dict, Optional +import vllm.envs as envs from vllm.config import VllmConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + +track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0 +batchsize_counter: Counter = Counter() +last_logging_time: float = 0 +batchsize_logging_interval: float = envs.VLLM_LOG_BATCHSIZE_INTERVAL @dataclass @@ -26,7 +37,26 @@ def get_forward_context() -> ForwardContext: @contextmanager def set_forward_context(context: Any, vllm_config: VllmConfig): """A context manager that stores the current forward context, - can be attention metadata, etc.""" + can be attention metadata, etc. + Here we can inject common logic for every model forward pass. + """ + global track_batchsize, batchsize_counter + global last_logging_time, batchsize_logging_interval + if track_batchsize and context is not None: + if hasattr(context, "num_prefill_tokens"): + # for v0 attention backends + batchsize = context.num_prefill_tokens + context.num_decode_tokens + else: + # for v1 attention backends + batchsize = context.num_input_tokens + batchsize_counter[batchsize] += 1 + if time.monotonic() - last_logging_time > batchsize_logging_interval: + last_logging_time = time.monotonic() + sorted_data = sorted(batchsize_counter.items(), + key=lambda x: x[1], + reverse=True) + logger.info("Batchsize distribution (batchsize, count): %s", + sorted_data) global _forward_context prev_context = _forward_context _forward_context = ForwardContext( diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 251a103e60f06..c9f04ace644c7 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -56,6 +56,7 @@ class FlashAttentionMetadata: seq_start_loc: torch.Tensor block_table: torch.Tensor slot_mapping: torch.Tensor + num_input_tokens: int = 0 # Number of tokens including padding. 
class FlashAttentionImpl(AttentionImpl): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0a5adfb28c9bd..a3335fa838352 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -445,6 +445,8 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + attn_metadata.num_input_tokens = num_input_tokens + # Get the inputs embeds. if encoder_outputs: inputs_embeds = self.model.get_input_embeddings( From 134810b3d9a05510622282479f0f9e2114b88017 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 10 Dec 2024 14:41:23 -0800 Subject: [PATCH 007/357] [V1][Bugfix] Always set enable_chunked_prefill = True for V1 (#11061) Signed-off-by: Woosuk Kwon --- vllm/engine/arg_utils.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3db069ec64ee4..7b9adc401abcf 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -122,7 +122,7 @@ class EngineArgs: cpu_offload_gb: float = 0 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None - max_num_seqs: int = 256 + max_num_seqs: Optional[int] = None max_logprobs: int = 20 # Default value for OpenAI Chat Completions API disable_log_stats: bool = False revision: Optional[str] = None @@ -205,6 +205,9 @@ def __post_init__(self): # by user. if self.enable_prefix_caching is None: self.enable_prefix_caching = bool(envs.VLLM_USE_V1) + # Override max_num_seqs if it's not set by user. + if self.max_num_seqs is None: + self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024 # support `EngineArgs(compilation_config={...})` # without having to manually construct a @@ -1225,19 +1228,19 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None: """ assert envs.VLLM_USE_V1, "V1 is not enabled" + # V1 always uses chunked prefills. + self.enable_chunked_prefill = True + # When no user override, set the default values based on the usage + # context. + # TODO(woosuk): Tune the default values for different hardware. if self.max_num_batched_tokens is None: - # When no user override, set the default values based on the - # usage context. 
if usage_context == UsageContext.LLM_CLASS: - logger.warning("Setting max_num_batched_tokens to 8192 " - "for LLM_CLASS usage context.") - self.max_num_seqs = 1024 self.max_num_batched_tokens = 8192 elif usage_context == UsageContext.OPENAI_API_SERVER: - logger.warning("Setting max_num_batched_tokens to 2048 " - "for OPENAI_API_SERVER usage context.") - self.max_num_seqs = 1024 self.max_num_batched_tokens = 2048 + logger.warning( + "Setting max_num_batched_tokens to %d for %s usage context.", + self.max_num_batched_tokens, usage_context.value) def _override_v1_engine_config(self, engine_config: VllmConfig) -> None: """ From 9a93973708d7f52f1d1439f8f32b8c1514d18b86 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 10 Dec 2024 19:16:22 -0500 Subject: [PATCH 008/357] [Bugfix] Fix Mamba multistep (#11071) Signed-off-by: Tyler Michael Smith --- vllm/attention/backends/placeholder_attn.py | 64 ++++++++++++++++++++- vllm/worker/multi_step_model_runner.py | 4 +- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 888adbffb8578..658039bfc3365 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -11,7 +11,8 @@ from vllm.multimodal import MultiModalPlaceholderMap if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUBuilder + from vllm.worker.model_runner import (ModelInputForGPUBuilder, + ModelInputForGPUWithSamplingMetadata) # Placeholder attention backend for models like Mamba and embedding models that # lack attention. @@ -186,6 +187,67 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: ) return self._cached_decode_metadata + def advance_step(self, + model_input: "ModelInputForGPUWithSamplingMetadata", + sampled_token_ids: Optional[torch.Tensor], + block_size: int, + num_seqs: int, + num_queries: int, + turn_prefills_into_decodes: bool = False): + """ + Update metadata in-place to advance one decode step. + """ + # When using cudagraph, the num_seqs is padded to the next captured + # batch sized, but num_queries tracks the actual number of requests in + # the batch. For --enforce-eager mode, num_seqs == num_queries + if num_seqs != num_queries: + assert num_seqs > num_queries + assert self.use_cuda_graph + + assert not turn_prefills_into_decodes, \ + ("Multi-Step + Chunked-Prefill is not supported for attention-free" + "models. turn_prefills_into_decodes is a " + "Multi-Step + Chunked-Prefill specific parameter.") + + assert self.seq_lens is not None + assert self.max_decode_seq_len == max(self.seq_lens) + + assert self.num_prefills == 0 + assert self.num_prefill_tokens == 0 + assert self.num_decode_tokens == num_seqs + + assert self.seq_lens is not None + assert len(self.seq_lens) == num_seqs + assert self.seq_lens_tensor is not None + assert self.seq_lens_tensor.shape == (num_seqs, ) + assert self.max_query_len == 1 + assert self.max_prefill_seq_len == 0 + + assert self.query_start_loc is not None + assert self.query_start_loc.shape == (num_queries + 1, ) + assert self.seq_start_loc is not None + assert self.seq_start_loc.shape == (num_seqs + 1, ) + + assert self.context_lens_tensor is not None + assert self.context_lens_tensor.shape == (num_queries, ) + + assert self.block_tables is not None + + # Update query lengths. 
Note that we update only queries and not seqs, + # since tensors may be padded due to captured cuda graph batch size + for i in range(num_queries): + self.seq_lens[i] += 1 + self.max_decode_seq_len = max(self.seq_lens) + + # Update sequences, masking off entries greater than num_queries + device = self.seq_lens_tensor.device + mask = torch.arange(self.seq_lens_tensor.size(0), + device=device) < num_queries + self.seq_lens_tensor += mask.to(self.seq_lens_tensor.dtype) + if sampled_token_ids is not None: + model_input.input_tokens.masked_scatter_( + mask, sampled_token_ids[:num_queries]) + class PlaceholderAttentionMetadataBuilder( AttentionMetadataBuilder[PlaceholderAttentionMetadata]): diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 3ca0d88a42183..e08a61e31fe42 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -29,7 +29,9 @@ logger = init_logger(__name__) -MULTI_STEP_ATTENTION_BACKENDS = ["FLASH_ATTN", "ROCM_FLASH", "FLASHINFER"] +MULTI_STEP_ATTENTION_BACKENDS = [ + "FLASH_ATTN", "ROCM_FLASH", "FLASHINFER", "NO_ATTENTION" +] MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN"] def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ From d5c5154fcf4c5d65551c98e458cbb027e5f4b672 Mon Sep 17 00:00:00 2001 From: Aurick Qiao Date: Tue, 10 Dec 2024 21:09:20 -0500 Subject: [PATCH 009/357] [Misc] LoRA + Chunked Prefill (#9057) --- tests/lora/test_chatglm3_tp.py | 9 ++++++--- tests/lora/test_gemma.py | 3 ++- tests/lora/test_llama_tp.py | 6 +++++- tests/lora/test_long_context.py | 3 ++- tests/lora/test_minicpmv.py | 3 ++- tests/lora/test_minicpmv_tp.py | 2 ++ tests/lora/test_mixtral.py | 1 + tests/lora/test_phi.py | 3 ++- tests/lora/test_quant_model.py | 9 ++++++--- vllm/config.py | 3 ++- vllm/core/scheduler.py | 15 ++++++++++++--- vllm/worker/model_runner.py | 12 +++++++----- 12 files changed, 49 insertions(+), 20 deletions(-) diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index f17464573459f..49a527b99ac16 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -53,7 +53,8 @@ def test_chatglm3_lora(chatglm3_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=1, - trust_remote_code=True) + trust_remote_code=True, + enable_chunked_prefill=True) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): @@ -73,7 +74,8 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): max_lora_rank=64, tensor_parallel_size=4, trust_remote_code=True, - fully_sharded_loras=False) + fully_sharded_loras=False, + enable_chunked_prefill=True) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): @@ -93,7 +95,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): max_lora_rank=64, tensor_parallel_size=4, trust_remote_code=True, - fully_sharded_loras=True) + fully_sharded_loras=True, + enable_chunked_prefill=True) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 15ec66b0f5502..5ae705e474ec6 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -37,7 +37,8 @@ def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, enable_lora=True, - max_loras=4) + max_loras=4, + enable_chunked_prefill=True) expected_lora_output = [ 
"more important than knowledge.\nAuthor: Albert Einstein\n", diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index d3ca7f878191a..dfeac380951d8 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -78,7 +78,8 @@ def test_llama_lora(sql_lora_files): enable_lora=True, max_num_seqs=16, max_loras=4, - tensor_parallel_size=1) + tensor_parallel_size=1, + enable_chunked_prefill=True) generate_and_test(llm, sql_lora_files) @@ -120,6 +121,7 @@ def test_llama_lora_tp4(sql_lora_files): max_num_seqs=16, max_loras=4, tensor_parallel_size=4, + enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) @@ -135,6 +137,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): max_loras=4, tensor_parallel_size=4, fully_sharded_loras=True, + enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) @@ -151,5 +154,6 @@ def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): tensor_parallel_size=4, fully_sharded_loras=True, enable_lora_bias=True, + enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index eada902c891f7..e7a34f2ced7ed 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -124,7 +124,8 @@ def lora_llm(long_context_infos): tensor_parallel_size=4, # FIXME enable async output processor disable_async_output_proc=True, - distributed_executor_backend="mp") + distributed_executor_backend="mp", + enable_chunked_prefill=True) yield llm del llm diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index 2c45ce5141f7d..1f3de9edc0d0f 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -67,7 +67,8 @@ def test_minicpmv_lora(minicpmv_lora_files): max_loras=4, max_lora_rank=64, trust_remote_code=True, - gpu_memory_utilization=0.97 # This model is pretty big for CI gpus + gpu_memory_utilization=0.97, # This model is pretty big for CI gpus + enable_chunked_prefill=True, ) output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index ba29e562e58ec..930f177953a5f 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -69,6 +69,7 @@ def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded): tensor_parallel_size=2, trust_remote_code=True, fully_sharded_loras=fully_sharded, + enable_chunked_prefill=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) @@ -89,6 +90,7 @@ def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=fully_sharded, + enable_chunked_prefill=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index dddc299da446b..150221dfce6ab 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -47,6 +47,7 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): max_loras=4, distributed_executor_backend="ray", tensor_parallel_size=tp_size, + enable_chunked_prefill=True, ) expected_lora_output = [ diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 733eff48a9bf3..5a3fcb8d690d9 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -53,7 +53,8 @@ def test_phi2_lora(phi2_lora_files): max_model_len=1024, enable_lora=True, max_loras=2, - enforce_eager=True) + 
enforce_eager=True, + enable_chunked_prefill=True) expected_lora_output = [ "SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;", # noqa: E501 diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 5432fa4ad0d3a..026269667b473 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -84,7 +84,8 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, tensor_parallel_size=tp_size, gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, - trust_remote_code=True) + trust_remote_code=True, + enable_chunked_prefill=True) if model.quantization is None: expected_no_lora_output = [ @@ -176,7 +177,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, tensor_parallel_size=1, gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, - trust_remote_code=True) + trust_remote_code=True, + enable_chunked_prefill=True) output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) del llm_tp1 @@ -189,7 +191,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, max_loras=4, tensor_parallel_size=2, gpu_memory_utilization=0.2, #avoid OOM - quantization=model.quantization) + quantization=model.quantization, + enable_chunked_prefill=True) output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) del llm_tp2 diff --git a/vllm/config.py b/vllm/config.py index 5fb9563fcf3a3..c66ddbb47f22e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1698,7 +1698,8 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: - raise ValueError("LoRA is not supported with chunked prefill yet.") + logger.warning("LoRA with chunked prefill is still experimental " + "and may be unstable.") @dataclass diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index d23009dae01ee..94c62743883ec 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -166,9 +166,18 @@ def is_empty(self) -> bool: and not self.blocks_to_swap_out and not self.blocks_to_copy) def _sort_by_lora_ids(self): - self.scheduled_seq_groups = sorted( - self.scheduled_seq_groups, - key=lambda g: (g.seq_group.lora_int_id, g.seq_group.request_id)) + assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups) + + def key_fn(group: ScheduledSequenceGroup): + key = (group.seq_group.lora_int_id, group.seq_group.request_id) + if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups): + # Sort sequence groups so that all prefills come before all + # decodes as required by chunked prefill. 
+ return (not group.seq_group.is_prefill(), *key) + return key + + self.scheduled_seq_groups = sorted(self.scheduled_seq_groups, + key=key_fn) @property def lora_requests(self) -> Set[LoRARequest]: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1bc5f65c7127f..551b84435fdc0 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -622,11 +622,13 @@ def _compute_lora_input(self, inter_data: InterDataForSeqGroup, inter_data.lora_requests.add(seq_group_metadata.lora_request) query_len = inter_data.query_lens[seq_idx] inter_data.lora_index_mapping.append([lora_id] * query_len) - inter_data.lora_prompt_mapping.append( - [lora_id] * - (query_len if seq_group_metadata.sampling_params - and seq_group_metadata.sampling_params.prompt_logprobs is not None - else 1)) + sampling_params = seq_group_metadata.sampling_params + if sampling_params and sampling_params.prompt_logprobs is not None: + inter_data.lora_prompt_mapping.append([lora_id] * query_len) + elif not self.chunked_prefill_enabled or seq_group_metadata.do_sample: + inter_data.lora_prompt_mapping.append([lora_id]) + else: + inter_data.lora_prompt_mapping.append([]) def _compute_prompt_adapter_input( self, inter_data: InterDataForSeqGroup, From ffa48c9146fda1e8810d1cfa159e1d70aadae6c6 Mon Sep 17 00:00:00 2001 From: Mor Zusman Date: Wed, 11 Dec 2024 04:53:37 +0200 Subject: [PATCH 010/357] [Model] PP support for Mamba-like models (#10992) Signed-off-by: mzusman --- docs/source/models/supported_models.rst | 6 +- tests/distributed/test_pipeline_parallel.py | 6 +- vllm/config.py | 58 +++++++++---- vllm/model_executor/models/interfaces.py | 37 ++++++++ vllm/model_executor/models/jamba.py | 93 ++++++++++++++------- vllm/model_executor/models/mamba.py | 68 ++++++++++----- vllm/model_executor/models/registry.py | 11 ++- vllm/utils.py | 5 ++ vllm/v1/worker/gpu_model_runner.py | 8 +- vllm/v1/worker/gpu_worker.py | 6 +- vllm/worker/cache_engine.py | 12 +-- 11 files changed, 229 insertions(+), 81 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 4e5b10967e3bb..6540e023c1ab0 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -128,7 +128,7 @@ Text Generation - FalconMamba - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. - ✅︎ - - + - ✅︎ * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. @@ -193,7 +193,7 @@ Text Generation - Jamba - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc. - ✅︎ - - + - ✅︎ * - :code:`LlamaForCausalLM` - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. @@ -203,7 +203,7 @@ Text Generation - Mamba - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. - - - + - ✅︎ * - :code:`MiniCPMForCausalLM` - MiniCPM - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc. 
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index b818ca921fcb0..85d408efafe96 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -156,13 +156,13 @@ def iter_params(self, model_name: str): # "internlm/internlm-chat-7b": PPTestSettings.fast(), "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True), "inceptionai/jais-13b-chat": PPTestSettings.fast(), - # TODO: Implement PP - # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(), + "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True), "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True), # Uses Llama # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(), + "state-spaces/mamba-130m-hf": PPTestSettings.fast(), "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4), "mosaicml/mpt-7b": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(), @@ -234,6 +234,8 @@ def iter_params(self, model_name: str): "OpenGVLab/InternVL2-1B", "microsoft/Phi-3-vision-128k-instruct", "fixie-ai/ultravox-v0_3", + # [LANGUAGE GENERATION - HYBRID ARCH] + "ai21labs/Jamba-tiny-dev", ] diff --git a/vllm/config.py b/vllm/config.py index c66ddbb47f22e..2a9f0ebae997d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,8 +27,8 @@ ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) -from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - print_warning_once, random_uuid, +from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, + get_cpu_memory, print_warning_once, random_uuid, resolve_obj_by_qualname) if TYPE_CHECKING: @@ -284,6 +284,7 @@ def __init__( self._verify_tokenizer_mode() self.is_attention_free = self._init_attention_free() + self.is_hybrid = self._init_is_hybrid() self.has_inner_state = self._init_has_inner_state() if current_platform.is_neuron(): @@ -340,6 +341,10 @@ def _init_attention_free(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) return ModelRegistry.is_attention_free_model(architectures) + def _init_is_hybrid(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.is_hybrid_model(architectures) + def _init_has_inner_state(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) return ModelRegistry.model_has_inner_state(architectures) @@ -669,26 +674,51 @@ def get_num_attention_heads(self, num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) return num_heads // parallel_config.tensor_parallel_size - def get_num_layers(self, parallel_config: "ParallelConfig") -> int: + def get_layers_start_end_indices( + self, parallel_config: "ParallelConfig") -> Tuple[int, int]: from vllm.distributed.utils import get_pp_indices total_num_hidden_layers = getattr(self.hf_text_config, "num_hidden_layers", 0) pp_rank = parallel_config.rank // parallel_config.tensor_parallel_size pp_size = parallel_config.pipeline_parallel_size start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) - return end - start - - def get_num_attention_layers(self, - parallel_config: "ParallelConfig") -> int: - if self.is_attention_free: - return 0 + return start, end - num_layers = 
self.get_num_layers(parallel_config) + def get_num_layers(self, parallel_config: "ParallelConfig") -> int: + start, end = self.get_layers_start_end_indices(parallel_config) + return end - start - # Transformers supports layers_block_type @property - layers = getattr(self.hf_config, "layers_block_type", - ["attention"] * num_layers) - return len([t for t in layers if t == "attention"]) + def get_num_layers_by_block_type( + self, + parallel_config: "ParallelConfig", + block_type: LayerBlockType = LayerBlockType.attention, + ) -> int: + # This function relies on 'layers_block_type' in hf_config, + # for w/o this attribute, we will need to have workarounds like so + attn_block_type = block_type == LayerBlockType.attention + is_transformer = not self.is_hybrid and not self.is_attention_free + start, end = self.get_layers_start_end_indices(parallel_config) + + if is_transformer: + # Handle the basic case first + return end - start if attn_block_type else 0 + elif self.is_attention_free: + # Attention free + # Note that this code assumes there + # is only one type of attention-free block type. + return 0 if attn_block_type else end - start + else: + # Hybrid model + layers_block_type_value = getattr(self.hf_config, + "layers_block_type", None) + if layers_block_type_value is None: + raise ValueError("The model is an hybrid without a" + "layers_block_type in the hf_config," + "cannot determine the num of " + f"{block_type.value} layers") + + return sum(t == block_type.value + for t in layers_block_type_value[start:end]) def get_multimodal_config(self) -> "MultiModalConfig": """ diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index c3979eab905db..70b78fe64f2d8 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -363,6 +363,43 @@ def is_attention_free( return isinstance(model, IsAttentionFree) +@runtime_checkable +class IsHybrid(Protocol): + """The interface required for all models like Jamba that have both + attention and mamba blocks, indicates that + hf_config has 'layers_block_type'""" + + is_hybrid: ClassVar[Literal[True]] = True + """ + A flag that indicates this model has both mamba and attention blocks + , also indicates that the model's hf_config has + 'layers_block_type' """ + + +@runtime_checkable +class _IsHybridType(Protocol): + is_hybrid: ClassVar[Literal[True]] + + +@overload +def is_hybrid(model: object) -> TypeIs[IsHybrid]: + ... + + +@overload +def is_hybrid(model: Type[object]) -> TypeIs[Type[IsHybrid]]: + ... 
+ + +def is_hybrid( + model: Union[Type[object], object] +) -> Union[TypeIs[Type[IsHybrid]], TypeIs[IsHybrid]]: + if isinstance(model, type): + return isinstance(model, _IsHybridType) + + return isinstance(model, IsHybrid) + + @runtime_checkable class SupportsCrossEncoding(Protocol): """The interface required for all models that support cross encoding.""" diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 5d5e8ae1ee532..6bb4c13ab35df 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -9,6 +9,7 @@ from vllm.attention.layer import Attention from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -25,9 +26,12 @@ MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.utils import LayerBlockType -from .interfaces import HasInnerState, SupportsLoRA -from .utils import maybe_prefix +from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP +from .utils import (is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -281,16 +285,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): org_num_embeddings=config.vocab_size, ) - decoder_layers = [] - for i in range(config.num_hidden_layers): - layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[i]] - decoder_layers.append( - layer_class(config, - layer_idx=i, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.layers.{i}")) - self.layers = nn.ModuleList(decoder_layers) + def get_layer(prefix: str): + layer_idx = int(prefix.rsplit(".", 1)[1]) + layer_class = ALL_DECODER_LAYER_TYPES[ + config.layers_block_type[layer_idx]] + return layer_class( + config, + layer_idx, + cache_config, + quant_config=quant_config, + prefix=prefix, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers") + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -304,26 +316,34 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, mamba_cache_params: MambaCacheParams, + intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - if inputs_embeds is not None: - hidden_states = inputs_embeds + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - for i in range(len(self.layers)): + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + kv_cache_index = 0 + mamba_cache_index = 0 + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] kv_cache = None layer_mamba_cache_params = None if 
isinstance(layer, JambaAttentionDecoderLayer): - kv_cache = kv_caches[(i - self.config.attn_layer_offset) // - self.config.attn_layer_period] + kv_cache = kv_caches[kv_cache_index] + kv_cache_index += 1 if isinstance(layer, JambaMambaDecoderLayer): - current_state_layer = i - (1 + - (i - self.config.attn_layer_offset) - // self.config.attn_layer_period) + current_state_layer = mamba_cache_index layer_mamba_cache_params = mamba_cache_params.at_layer_idx( current_state_layer) + mamba_cache_index += 1 hidden_states, residual = layer( positions=positions, @@ -332,11 +352,17 @@ def forward( attn_metadata=attn_metadata, residual=residual, mamba_cache_params=layer_mamba_cache_params) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) hidden_states, _ = self.final_layernorm(hidden_states, residual) return hidden_states -class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA): +class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, + IsHybrid): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -368,6 +394,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config self.scheduler_config = scheduler_config self.model = JambaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -390,6 +418,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -406,10 +437,8 @@ def forward(self, self.scheduler_config.max_num_seqs) if self.scheduler_config else max(_BATCH_SIZES_TO_CAPTURE) + 2) - layers_type = self.config.layers_block_type - num_mamba_layers = sum( - [layer_type == "mamba" for layer_type in layers_type]) - + num_mamba_layers = self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( self.lm_head.weight.dtype, num_mamba_layers, max_batch_size, *self._get_mamba_cache_shape()) @@ -423,7 +452,7 @@ def forward(self, state_indices_tensor) hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata, mamba_cache_params, - inputs_embeds) + intermediate_tensors, inputs_embeds) return hidden_states def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): @@ -504,8 +533,12 @@ def load_weights(self, weights: Iterable[Tuple[str, continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -520,6 +553,8 @@ def load_weights(self, weights: Iterable[Tuple[str, if weight_name not in name: continue + if is_pp_missing_parameter(name, self): + continue name = name.replace(weight_name, param_name) param = params_dict[name] weight_loader = param.weight_loader @@ -533,6 +568,8 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue + if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 8bdcd2c5aad1f..1f5cd02711899 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -8,6 +8,7 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer @@ -18,13 +19,16 @@ DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import (HasInnerState, - IsAttentionFree) + IsAttentionFree, SupportsPP) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.utils import LayerBlockType -from .utils import maybe_prefix +from .utils import (is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -95,15 +99,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): org_num_embeddings=config.vocab_size, ) - decoder_layers = [] - for i in range(config.num_hidden_layers): - decoder_layers.append( - MambaDecoderLayer(config, - cache_config=cache_config, - quant_config=quant_config)) - self.layers = nn.ModuleList(decoder_layers) + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MambaDecoderLayer( + config, cache_config=cache_config, quant_config=quant_config), + prefix=f"{prefix}.layers") + self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embeddings(input_ids) @@ -114,29 +120,40 @@ def forward( positions: torch.Tensor, attn_metadata: AttentionMetadata, mamba_cache_params: MambaCacheParams, + intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - - if inputs_embeds is not None: - hidden_states = inputs_embeds + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] - for i in range(len(self.layers)): + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer( positions=positions, hidden_states=hidden_states, attn_metadata=attn_metadata, residual=residual, - mamba_cache_params=mamba_cache_params.at_layer_idx(i)) + 
mamba_cache_params=mamba_cache_params.at_layer_idx( + i - self.start_layer)) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) hidden_states, _ = self.norm_f(hidden_states, residual) return hidden_states -class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree): +class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config @@ -148,7 +165,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = config + self.vllm_config = vllm_config self.scheduler_config = scheduler_config + self.model_config = vllm_config.model_config self.backbone = MambaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "backbone")) self.unpadded_vocab_size = config.vocab_size @@ -174,6 +193,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( + self.backbone.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.backbone.get_input_embeddings(input_ids) @@ -189,9 +211,12 @@ def forward(self, max_batch_size = (VllmConfig.get_graph_batch_size( self.scheduler_config.max_num_seqs) if self.scheduler_config else max(_BATCH_SIZES_TO_CAPTURE) + 2) + + num_mamba_layers = self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( - self.lm_head.weight.dtype, self.config.num_hidden_layers, - max_batch_size, *self._get_mamba_cache_shape()) + self.lm_head.weight.dtype, num_mamba_layers, max_batch_size, + *self._get_mamba_cache_shape()) ( mamba_cache_tensors, @@ -204,7 +229,8 @@ def forward(self, state_indices_tensor) hidden_states = self.backbone(input_ids, positions, attn_metadata, - mamba_cache_params, inputs_embeds) + mamba_cache_params, intermediate_tensors, + inputs_embeds) return hidden_states @@ -252,6 +278,8 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue + if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index e69596aa915b5..4beea4641f5ab 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -21,7 +21,7 @@ from vllm.platforms import current_platform from .adapters import as_embedding_model -from .interfaces import (has_inner_state, is_attention_free, +from .interfaces import (has_inner_state, is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, supports_pp) from .interfaces_base import is_pooling_model, is_text_generation_model @@ -218,6 +218,7 @@ class _ModelInfo: supports_pp: bool has_inner_state: bool is_attention_free: bool + is_hybrid: bool @staticmethod def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": @@ -239,6 +240,7 @@ def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": supports_pp=supports_pp(model), has_inner_state=has_inner_state(model), is_attention_free=is_attention_free(model), + is_hybrid=is_hybrid(model), ) @@ -484,6 +486,13 @@ def is_attention_free_model( model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_attention_free + def is_hybrid_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.is_hybrid + ModelRegistry = _ModelRegistry({ model_arch: _LazyRegisteredModel( diff --git a/vllm/utils.py b/vllm/utils.py index 7cdb2cb320b05..1882264c19775 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -170,6 +170,11 @@ class Device(enum.Enum): CPU = enum.auto() +class LayerBlockType(enum.Enum): + attention = "attention" + mamba = "mamba" + + class Counter: def __init__(self, start: int = 0) -> None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a3335fa838352..8d9976ded7c5e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -15,8 +15,8 @@ from vllm.model_executor.model_loader import get_model from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingType -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, - is_pin_memory_available) +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, + LayerBlockType, cdiv, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) from vllm.v1.outputs import ModelRunnerOutput @@ -68,8 +68,8 @@ def __init__( self.max_num_tokens = scheduler_config.max_num_batched_tokens # Model-related. 
- self.num_attn_layers = model_config.get_num_attention_layers( - parallel_config) + self.num_attn_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() self.hidden_size = model_config.get_hidden_size() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d32848c3775ae..49e415ab72e0b 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -14,7 +14,7 @@ from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size from vllm.v1.core.scheduler import SchedulerOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu_model_runner import GPUModelRunner @@ -260,8 +260,8 @@ def _get_cache_block_size( ) -> int: head_size = model_config.get_head_size() num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_attention_layers( - parallel_config) + num_attention_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) key_cache_block = cache_config.block_size * num_heads * head_size value_cache_block = key_cache_block diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index ac3270d1c9909..7ccd4571b19df 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,8 +6,8 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, - is_pin_memory_available) +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, + get_dtype_size, is_pin_memory_available) logger = init_logger(__name__) @@ -34,8 +34,8 @@ def __init__( self.head_size = model_config.get_head_size() # Models like Jamba, have mixed typed layers, E.g Mamba - self.num_attention_layers = model_config.get_num_attention_layers( - parallel_config) + self.num_attention_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.block_size = cache_config.block_size @@ -105,8 +105,8 @@ def get_cache_block_size( ) -> int: head_size = model_config.get_head_size() num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_attention_layers( - parallel_config) + num_attention_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) key_cache_block = cache_config.block_size * num_heads * head_size value_cache_block = key_cache_block From e39400a4b60d28ff5c0a1a5194068c928adcaf98 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Wed, 11 Dec 2024 01:51:40 -0300 Subject: [PATCH 011/357] Fix streaming for granite tool call when <|tool_call|> is present (#11069) Signed-off-by: Max de Bayser --- vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index 00917c866e496..dae481a2154a1 100644 --- 
a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -88,7 +88,11 @@ def extract_tool_calls_streaming( ) -> Union[DeltaMessage, None]: start_idx = consume_space(0, current_text) - if not current_text or current_text[start_idx] != '[': + if current_text[start_idx:].startswith(self.bot_token): + start_idx = consume_space(start_idx + len(self.bot_token), + current_text) + if not current_text or start_idx >= len(current_text)\ + or current_text[start_idx] != '[': return DeltaMessage(content=delta_text) # bit mask flags for partial JSON parsing. If the name hasn't been From 2e33fe419186c65a18da6668972d61d7bbc31564 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Dec 2024 13:02:02 +0800 Subject: [PATCH 012/357] [CI/Build] Check transformers v4.47 (#10991) Signed-off-by: DarkLight1337 --- requirements-test.txt | 4 ++-- .../vision_language/mm_processor_kwargs/test_idefics3.py | 9 --------- .../models/embedding/vision_language/test_llava_next.py | 2 +- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index 38a064bca449a..8ceb705cdffd7 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -506,7 +506,7 @@ tiktoken==0.7.0 # mistral-common timm==1.0.11 # via -r requirements-test.in -tokenizers==0.20.3 +tokenizers==0.21.0 # via transformers torch==2.5.1 # via @@ -534,7 +534,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.46.3 +transformers==4.47.0 # via # lm-eval # peft diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py index 31896bfd13e8c..c71a2d359043d 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py @@ -3,7 +3,6 @@ import pytest import torch -import transformers from transformers import AutoImageProcessor, AutoTokenizer from vllm.inputs import InputContext, token_inputs @@ -36,8 +35,6 @@ def get_max_idefics3_image_tokens(): return get_max_idefics3_image_tokens -@pytest.mark.skipif(transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336]) def test_input_mapper_override(model: str, image_assets: _ImageAssets, @@ -77,8 +74,6 @@ def test_input_mapper_override(model: str, image_assets: _ImageAssets, assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) -@pytest.mark.skipif(transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("longest_edge, expected_max_tokens", [ (None, 2873), @@ -107,8 +102,6 @@ def test_max_tokens_override(get_max_idefics3_image_tokens, model: str, assert expected_max_tokens == actual_max_tokens -@pytest.mark.skipif(transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [ (168, 169, 1), @@ -143,8 +136,6 @@ def test_dummy_data_override(dummy_data_for_idefics3, model: str, assert img_tok_count == toks_per_img * num_imgs -@pytest.mark.skipif(transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0") @pytest.mark.parametrize("model", models) 
@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [ (336, 169 * (1**2 + 1), 1), diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 329c6ba279f89..693abd7252d5e 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -86,7 +86,7 @@ def _run_test( ) -@pytest.mark.skipif(transformers.__version__.startswith("4.46"), +@pytest.mark.skipif(transformers.__version__ >= "4.46", reason="Model broken with changes in transformers 4.46") @pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) From 3fb4b4f1634a896653acc12c72b8e5d6d87a8f82 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 11 Dec 2024 00:39:53 -0800 Subject: [PATCH 013/357] [ci/build] Fix AMD CI dependencies (#11087) --- requirements-rocm.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 121123611d2da..ccc9062341772 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -5,7 +5,8 @@ awscli boto3 botocore +datasets ray >= 2.10.0 peft pytest-asyncio -tensorizer>=2.9.0 \ No newline at end of file +tensorizer>=2.9.0 From 9974fca047bb332ec68377be4579ea515a300d69 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 11 Dec 2024 01:01:53 -0800 Subject: [PATCH 014/357] [ci/build] Fix entrypoints test and pin outlines version (#11088) --- requirements-common.txt | 2 +- .../guided_decoding/outlines_logits_processors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index c71fc458aca13..792cd58e80669 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -18,7 +18,7 @@ prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines >= 0.1.8 +outlines == 0.1.9 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 1f0dbe024609d..b63fed1c8a8c3 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -25,7 +25,7 @@ from outlines import grammars from outlines.caching import cache from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write -from outlines.fsm.json_schema import build_regex_from_schema +from outlines_core.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel from transformers import PreTrainedTokenizerBase From 61b1d2f6aef8e29c6a0d795a9c6682d525f4d8cc Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 11 Dec 2024 04:26:36 -0500 Subject: [PATCH 015/357] [Core] v1: Use atexit to handle engine core client shutdown (#11076) Signed-off-by: Russell Bryant --- vllm/v1/engine/core_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ee89cece73141..4d96b323d1662 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,4 @@ +import atexit import multiprocessing from typing import List, Union @@ -157,6 +158,7 @@ def __init__( should_shutdown=self.should_shutdown, **kwargs, ) + 
atexit.register(self.shutdown) def shutdown(self): # Send shutdown signal to background process. From 2e32f5d28db3cd79f6a421f640e083be1f9468b7 Mon Sep 17 00:00:00 2001 From: B-201 Date: Wed, 11 Dec 2024 17:27:07 +0800 Subject: [PATCH 016/357] [Bugfix] Fix Idefics3 fails during multi-image inference (#11080) Signed-off-by: B-201 --- vllm/model_executor/models/idefics3.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index e5d2edbd81eb1..17e772e7faa32 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -60,7 +60,8 @@ class Idefics3ImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor """ - Shape: `(batch_size * num_images, num_channels, height, width)` + Shape: `(batch_size * num_images * num_patches, + num_channels, height, width)` """ pixel_attention_mask: Optional[torch.BoolTensor] @@ -520,13 +521,17 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") - return Idefics3ImagePixelInputs(type="pixel_values", - data=self._validate_pixel_values( - flatten_bn(pixel_values, - concat=True)), - pixel_attention_mask=flatten_bn( - pixel_attention_mask, - concat=True)) + if isinstance(pixel_values, list): + pixel_values = torch.cat(pixel_values, dim=1) + pixel_attention_mask = torch.cat(pixel_attention_mask, dim=1) + else: + pixel_values = flatten_bn(pixel_values) + pixel_attention_mask = flatten_bn(pixel_attention_mask) + + return Idefics3ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + pixel_attention_mask=pixel_attention_mask) raise AssertionError("This line should be unreachable.") From 40766ca1b8b0ef92e220595bda96c4336b597e5b Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 11 Dec 2024 04:27:39 -0500 Subject: [PATCH 017/357] [Bugfix]: Clamp `-inf` logprob values in prompt_logprobs (#11073) Signed-off-by: Rafael Vasquez --- vllm/entrypoints/openai/serving_completion.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c54d5f07cf58c..ee97d35f2b087 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -392,6 +392,12 @@ def request_output_to_completion_response( prompt_token_ids = final_res.prompt_token_ids assert prompt_token_ids is not None prompt_logprobs = final_res.prompt_logprobs + if prompt_logprobs: + for logprob_dict in prompt_logprobs: + if logprob_dict: + for logprob_values in logprob_dict.values(): + if logprob_values.logprob == float('-inf'): + logprob_values.logprob = -9999.0 prompt_text = final_res.prompt token_ids: GenericSequence[int] From 8f10d5e3930f05c2057a831cd80ba24c52b8ceef Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Dec 2024 17:28:00 +0800 Subject: [PATCH 018/357] [Misc] Split up pooling tasks (#10820) Signed-off-by: DarkLight1337 --- docs/source/index.rst | 2 + docs/source/models/generative_models.rst | 146 ++++++++++++++++ docs/source/models/pooling_models.rst | 99 +++++++++++ docs/source/models/supported_models.rst | 157 ++++++++++++------ docs/source/usage/compatibility_matrix.rst | 12 +- examples/offline_inference_embedding.py | 7 +- ...ine_inference_vision_language_embedding.py | 4 +- tests/compile/test_basic_correctness.py | 4 +- tests/core/test_scheduler_encoder_decoder.py | 2 
+- .../openai/test_vision_embedding.py | 2 +- .../embedding/language/test_embedding.py | 2 +- .../models/embedding/language/test_scoring.py | 12 +- .../vision_language/test_dse_qwen2_vl.py | 2 +- .../vision_language/test_llava_next.py | 2 +- .../embedding/vision_language/test_phi3v.py | 2 +- tests/test_config.py | 17 +- vllm/config.py | 137 ++++++++++----- vllm/core/scheduler.py | 2 +- vllm/engine/arg_utils.py | 7 +- vllm/engine/llm_engine.py | 4 +- vllm/entrypoints/llm.py | 53 +++--- vllm/entrypoints/openai/api_server.py | 8 +- vllm/entrypoints/openai/run_batch.py | 4 +- vllm/model_executor/model_loader/utils.py | 2 +- vllm/v1/engine/core.py | 2 +- vllm/worker/cpu_worker.py | 2 +- vllm/worker/worker.py | 2 +- 27 files changed, 527 insertions(+), 168 deletions(-) create mode 100644 docs/source/models/generative_models.rst create mode 100644 docs/source/models/pooling_models.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index ebf1361976c5e..842013d6d49c4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -94,6 +94,8 @@ Documentation :caption: Models models/supported_models + models/generative_models + models/pooling_models models/adding_model models/enabling_multimodal_inputs diff --git a/docs/source/models/generative_models.rst b/docs/source/models/generative_models.rst new file mode 100644 index 0000000000000..fb71185600863 --- /dev/null +++ b/docs/source/models/generative_models.rst @@ -0,0 +1,146 @@ +.. _generative_models: + +Generative Models +================= + +vLLM provides first-class support for generative models, which covers most of LLMs. + +In vLLM, generative models implement the :class:`~vllm.model_executor.models.VllmModelForTextGeneration` interface. +Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, +which are then passed through :class:`~vllm.model_executor.layers.Sampler` to obtain the final text. + +Offline Inference +----------------- + +The :class:`~vllm.LLM` class provides various methods for offline inference. +See :ref:`Engine Arguments ` for a list of options when initializing the model. + +For generative models, the only supported :code:`task` option is :code:`"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + +``LLM.generate`` +^^^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.generate` method is available to all generative models in vLLM. +It is similar to `its counterpart in HF Transformers `__, +except that tokenization and detokenization are also performed automatically. + +.. code-block:: python + + llm = LLM(model="facebook/opt-125m") + outputs = llm.generate("Hello, my name is") + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +You can optionally control the language generation by passing :class:`~vllm.SamplingParams`. +For example, you can use greedy sampling by setting :code:`temperature=0`: + +.. code-block:: python + + llm = LLM(model="facebook/opt-125m") + params = SamplingParams(temperature=0) + outputs = llm.generate("Hello, my name is", params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +A code example can be found in `examples/offline_inference.py `_. 
+ +``LLM.beam_search`` +^^^^^^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.beam_search` method implements `beam search `__ on top of :class:`~vllm.LLM.generate`. +For example, to search using 5 beams and output at most 50 tokens: + +.. code-block:: python + + llm = LLM(model="facebook/opt-125m") + params = BeamSearchParams(beam_width=5, max_tokens=50) + outputs = llm.generate("Hello, my name is", params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +``LLM.chat`` +^^^^^^^^^^^^ + +The :class:`~vllm.LLM.chat` method implements chat functionality on top of :class:`~vllm.LLM.generate`. +In particular, it accepts input similar to `OpenAI Chat Completions API `__ +and automatically applies the model's `chat template `__ to format the prompt. + +.. important:: + + In general, only instruction-tuned models have a chat template. + Base models may perform poorly as they are not trained to respond to the chat conversation. + +.. code-block:: python + + llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, + ] + outputs = llm.chat(conversation) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +A code example can be found in `examples/offline_inference_chat.py `_. + +If the model doesn't have a chat template or you want to specify another one, +you can explicitly pass a chat template: + +.. code-block:: python + + from vllm.entrypoints.chat_utils import load_chat_template + + # You can find a list of existing chat templates under `examples/` + custom_template = load_chat_template(chat_template="") + print("Loaded chat template:", custom_template) + + outputs = llm.chat(conversation, chat_template=custom_template) + +Online Inference +---------------- + +Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. +Please click on the above link for more details on how to launch the server. + +Completions API +^^^^^^^^^^^^^^^ + +Our Completions API is similar to ``LLM.generate`` but only accepts text. +It is compatible with `OpenAI Completions API `__ +so that you can use OpenAI client to interact with it. +A code example can be found in `examples/openai_completion_client.py `_. + +Chat API +^^^^^^^^ + +Our Chat API is similar to ``LLM.chat``, accepting both text and :ref:`multi-modal inputs `. +It is compatible with `OpenAI Chat Completions API `__ +so that you can use OpenAI client to interact with it. +A code example can be found in `examples/openai_chat_completion_client.py `_. diff --git a/docs/source/models/pooling_models.rst b/docs/source/models/pooling_models.rst new file mode 100644 index 0000000000000..7fa66274c3c5a --- /dev/null +++ b/docs/source/models/pooling_models.rst @@ -0,0 +1,99 @@ +.. _pooling_models: + +Pooling Models +============== + +vLLM also supports pooling models, including embedding, reranking and reward models. + +In vLLM, pooling models implement the :class:`~vllm.model_executor.models.VllmModelForPooling` interface. 
+These models use a :class:`~vllm.model_executor.layers.Pooler` to aggregate the final hidden states of the input +before returning them. + +.. note:: + + We currently support pooling models primarily as a matter of convenience. + As shown in the :ref:`Compatibility Matrix `, most vLLM features are not applicable to + pooling models as they only work on the generation or decode stage, so performance may not improve as much. + +Offline Inference +----------------- + +The :class:`~vllm.LLM` class provides various methods for offline inference. +See :ref:`Engine Arguments ` for a list of options when initializing the model. + +For pooling models, we support the following :code:`task` options: + +- Embedding (:code:`"embed"` / :code:`"embedding"`) +- Classification (:code:`"classify"`) +- Sentence Pair Scoring (:code:`"score"`) +- Reward Modeling (:code:`"reward"`) + +The selected task determines the default :class:`~vllm.model_executor.layers.Pooler` that is used: + +- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. +- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. +- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. +- Reward Modeling: Extract all of the hidden states and return them directly. + +When loading `Sentence Transformers `__ models, +we attempt to override the default pooler based on its Sentence Transformers configuration file (:code:`modules.json`). + +You can customize the model's pooling method via the :code:`override_pooler_config` option, +which takes priority over both the model's and Sentence Transformers's defaults. + +``LLM.encode`` +^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.encode` method is available to all pooling models in vLLM. +It returns the aggregated hidden states directly. + +.. code-block:: python + + llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") + outputs = llm.encode("Hello, my name is") + + outputs = model.encode(prompts) + for output in outputs: + embeddings = output.outputs.embedding + print(f"Prompt: {prompt!r}, Embeddings (size={len(embeddings)}: {embeddings!r}") + +A code example can be found in `examples/offline_inference_embedding.py `_. + +``LLM.score`` +^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.score` method outputs similarity scores between sentence pairs. +It is primarily designed for `cross-encoder models `__. +These types of models serve as rerankers between candidate query-document pairs in RAG systems. + +.. note:: + + vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. + To handle RAG at a higher level, you should use integration frameworks such as `LangChain `_. + +You can use `these tests `_ as reference. + +Online Inference +---------------- + +Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. +Please click on the above link for more details on how to launch the server. + +Embeddings API +^^^^^^^^^^^^^^ + +Our Embeddings API is similar to ``LLM.encode``, accepting both text and :ref:`multi-modal inputs `. + +The text-only API is compatible with `OpenAI Embeddings API `__ +so that you can use OpenAI client to interact with it. +A code example can be found in `examples/openai_embedding_client.py `_. + +The multi-modal API is an extension of the `OpenAI Embeddings API `__ +that incorporates `OpenAI Chat Completions API `__, +so it is not part of the OpenAI standard. 
Please see :ref:`this page ` for more details on how to use it. + +Score API +^^^^^^^^^ + +Our Score API is similar to ``LLM.score``. +Please see `this page <../serving/openai_compatible_server.html#score-api-for-cross-encoder-models>`__ for more details on how to use it. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 6540e023c1ab0..b9957cf9563b1 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -3,11 +3,21 @@ Supported Models ================ -vLLM supports a variety of generative and embedding models from `HuggingFace (HF) Transformers `_. -This page lists the model architectures that are currently supported by vLLM. +vLLM supports generative and pooling models across various tasks. +If a model supports more than one task, you can set the task via the :code:`--task` argument. + +For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. -For other models, you can check the :code:`config.json` file inside the model repository. +Loading a Model +^^^^^^^^^^^^^^^ + +HuggingFace Hub ++++++++++++++++ + +By default, vLLM loads models from `HuggingFace (HF) Hub `_. + +To determine whether a given model is supported, you can check the :code:`config.json` file inside the HF repository. If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory. .. tip:: @@ -17,38 +27,57 @@ If the :code:`"architectures"` field contains a model architecture listed below, from vllm import LLM - llm = LLM(model=...) # Name or path of your model + # For generative models (task=generate) only + llm = LLM(model=..., task="generate") # Name or path of your model output = llm.generate("Hello, my name is") print(output) - If vLLM successfully generates text, it indicates that your model is supported. + # For pooling models (task={embed,classify,reward}) only + llm = LLM(model=..., task="embed") # Name or path of your model + output = llm.encode("Hello, my name is") + print(output) + + If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` for instructions on how to implement your model in vLLM. Alternatively, you can `open an issue on GitHub `_ to request vLLM support. -.. note:: - To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: +ModelScope +++++++++++ - .. code-block:: shell +To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: - $ export VLLM_USE_MODELSCOPE=True +.. code-block:: shell - And use with :code:`trust_remote_code=True`. + $ export VLLM_USE_MODELSCOPE=True - .. code-block:: python +And use with :code:`trust_remote_code=True`. - from vllm import LLM +.. 
code-block:: python - llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) + from vllm import LLM + + llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) -Text-only Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^ + # For generative models (task=generate) only + output = llm.generate("Hello, my name is") + print(output) -Text Generation ---------------- + # For pooling models (task={embed,classify,reward}) only + output = llm.encode("Hello, my name is") + print(output) + +List of Text-only Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Generative Models ++++++++++++++++++ + +See :ref:`this page ` for more information on how to use generative models. + +Text Generation (``--task generate``) +------------------------------------- .. list-table:: :widths: 25 25 50 5 5 @@ -328,8 +357,24 @@ Text Generation .. note:: Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -Text Embedding --------------- +Pooling Models +++++++++++++++ + +See :ref:`this page ` for more information on how to use pooling models. + +.. important:: + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. + +Text Embedding (``--task embed``) +--------------------------------- + +Any text generation model can be converted into an embedding model by passing :code:`--task embed`. + +.. note:: + To get the best results, you should use pooling models that are specifically trained as such. + +The following table lists those that are tested in vLLM. .. list-table:: :widths: 25 25 50 5 5 @@ -371,13 +416,6 @@ Text Embedding - - -.. important:: - Some model architectures support both generation and embedding tasks. - In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. - -.. tip:: - You can override the model's pooling method by passing :code:`--override-pooler-config`. - .. note:: :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`. @@ -389,8 +427,8 @@ Text Embedding On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention despite being described otherwise on its model card. -Reward Modeling ---------------- +Reward Modeling (``--task reward``) +----------------------------------- .. list-table:: :widths: 25 25 50 5 5 @@ -416,11 +454,8 @@ Reward Modeling For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. -.. note:: - As an interim measure, these models are supported in both offline and online inference via Embeddings API. - -Classification ---------------- +Classification (``--task classify``) +------------------------------------ .. list-table:: :widths: 25 25 50 5 5 @@ -437,11 +472,8 @@ Classification - ✅︎ - ✅︎ -.. note:: - As an interim measure, these models are supported in both offline and online inference via Embeddings API. - -Sentence Pair Scoring ---------------------- +Sentence Pair Scoring (``--task score``) +---------------------------------------- .. 
list-table:: :widths: 25 25 50 5 5 @@ -468,13 +500,10 @@ Sentence Pair Scoring - - -.. note:: - These models are supported in both offline and online inference via Score API. - .. _supported_mm_models: -Multimodal Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^ +List of Multimodal Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The following modalities are supported depending on the model: @@ -491,8 +520,15 @@ On the other hand, modalities separated by :code:`/` are mutually exclusive. - e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. -Text Generation ---------------- +See :ref:`this page ` on how to pass multi-modal inputs to the model. + +Generative Models ++++++++++++++++++ + +See :ref:`this page ` for more information on how to use generative models. + +Text Generation (``--task generate``) +------------------------------------- .. list-table:: :widths: 25 25 15 20 5 5 5 @@ -696,8 +732,24 @@ Text Generation The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 -Multimodal Embedding --------------------- +Pooling Models +++++++++++++++ + +See :ref:`this page ` for more information on how to use pooling models. + +.. important:: + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. + +Text Embedding (``--task embed``) +--------------------------------- + +Any text generation model can be converted into an embedding model by passing :code:`--task embed`. + +.. note:: + To get the best results, you should use pooling models that are specifically trained as such. + +The following table lists those that are tested in vLLM. .. list-table:: :widths: 25 25 15 25 5 5 @@ -728,12 +780,7 @@ Multimodal Embedding - - ✅︎ -.. important:: - Some model architectures support both generation and embedding tasks. - In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. - -.. tip:: - You can override the model's pooling method by passing :code:`--override-pooler-config`. 
+---- Model Support Policy ===================== diff --git a/docs/source/usage/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.rst index a93632ff36fb8..04dd72b1e3527 100644 --- a/docs/source/usage/compatibility_matrix.rst +++ b/docs/source/usage/compatibility_matrix.rst @@ -39,13 +39,13 @@ Feature x Feature - :abbr:`prmpt adptr (Prompt Adapter)` - :ref:`SD ` - CUDA graph - - :abbr:`emd (Embedding Models)` + - :abbr:`pooling (Pooling Models)` - :abbr:`enc-dec (Encoder-Decoder Models)` - :abbr:`logP (Logprobs)` - :abbr:`prmpt logP (Prompt Logprobs)` - :abbr:`async output (Async Output Processing)` - multi-step - - :abbr:`mm (Multimodal)` + - :abbr:`mm (Multimodal Inputs)` - best-of - beam-search - :abbr:`guided dec (Guided Decoding)` @@ -151,7 +151,7 @@ Feature x Feature - - - - * - :abbr:`emd (Embedding Models)` + * - :abbr:`pooling (Pooling Models)` - ✗ - ✗ - ✗ @@ -253,7 +253,7 @@ Feature x Feature - - - - * - :abbr:`mm (Multimodal)` + * - :abbr:`mm (Multimodal Inputs)` - ✅ - `✗ `__ - `✗ `__ @@ -386,7 +386,7 @@ Feature x Hardware - ✅ - ✗ - ✅ - * - :abbr:`emd (Embedding Models)` + * - :abbr:`pooling (Pooling Models)` - ✅ - ✅ - ✅ @@ -402,7 +402,7 @@ Feature x Hardware - ✅ - ✅ - ✗ - * - :abbr:`mm (Multimodal)` + * - :abbr:`mm (Multimodal Inputs)` - ✅ - ✅ - ✅ diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference_embedding.py index ae158eef2ca4c..17f6d992073d7 100644 --- a/examples/offline_inference_embedding.py +++ b/examples/offline_inference_embedding.py @@ -9,7 +9,12 @@ ] # Create an LLM. -model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) +model = LLM( + model="intfloat/e5-mistral-7b-instruct", + task="embed", # You should pass task="embed" for embedding models + enforce_eager=True, +) + # Generate embedding. The output is a list of PoolingRequestOutputs. outputs = model.encode(prompts) # Print the outputs. 
diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index e1732d045f949..bf466109f0981 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -59,7 +59,7 @@ def run_e5_v(query: Query): llm = LLM( model="royokong/e5-v", - task="embedding", + task="embed", max_model_len=4096, ) @@ -88,7 +88,7 @@ def run_vlm2vec(query: Query): llm = LLM( model="TIGER-Lab/VLM2Vec-Full", - task="embedding", + task="embed", trust_remote_code=True, mm_processor_kwargs={"num_crops": 4}, ) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 99781c55b672e..87d5aefea6cb4 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -55,7 +55,7 @@ class TestSetting: # embedding model TestSetting( model="BAAI/bge-multilingual-gemma2", - model_args=["--task", "embedding"], + model_args=["--task", "embed"], pp_size=1, tp_size=1, attn_backend="FLASHINFER", @@ -65,7 +65,7 @@ class TestSetting: # encoder-based embedding model (BERT) TestSetting( model="BAAI/bge-base-en-v1.5", - model_args=["--task", "embedding"], + model_args=["--task", "embed"], pp_size=1, tp_size=1, attn_backend="XFORMERS", diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index 7cd0416d321ef..16bea54936bc8 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -37,7 +37,7 @@ def test_scheduler_schedule_simple_encoder_decoder(): num_seq_group = 4 max_model_len = 16 scheduler_config = SchedulerConfig( - task="generate", + "generate", max_num_batched_tokens=64, max_num_seqs=num_seq_group, max_model_len=max_model_len, diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 425f2a10ec855..43c63daacb17f 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -27,7 +27,7 @@ def server(): args = [ "--task", - "embedding", + "embed", "--dtype", "bfloat16", "--max-model-len", diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 5ef8540265d14..f458ef5ef556d 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -54,7 +54,7 @@ def test_models( hf_outputs = hf_model.encode(example_prompts) with vllm_runner(model, - task="embedding", + task="embed", dtype=dtype, max_model_len=None, **vllm_extra_kwargs) as vllm_model: diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index 30fa5ea7b36c0..0c3115d195fc1 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -35,9 +35,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict([text_pair]).tolist() - with vllm_runner(model_name, - task="embedding", - dtype=dtype, + with vllm_runner(model_name, task="score", dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) @@ -58,9 +56,7 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: 
hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, - task="embedding", - dtype=dtype, + with vllm_runner(model_name, task="score", dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) @@ -82,9 +78,7 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, - task="embedding", - dtype=dtype, + with vllm_runner(model_name, task="score", dtype=dtype, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py index 3dd8cb729f8a6..2641987b25a3a 100644 --- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -93,7 +93,7 @@ def _run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). with vllm_runner(model, - task="embedding", + task="embed", dtype=dtype, enforce_eager=True, max_model_len=8192) as vllm_model: diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 693abd7252d5e..f4cd8b81a0d7d 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -47,7 +47,7 @@ def _run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). with vllm_runner(model, - task="embedding", + task="embed", dtype=dtype, max_model_len=4096, enforce_eager=True) as vllm_model: diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 6145aff1a5ea2..9374c23dd6ffe 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -39,7 +39,7 @@ def _run_test( # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- with vllm_runner(model, task="embedding", dtype=dtype, + with vllm_runner(model, task="embed", dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.encode(input_texts, images=input_images) diff --git a/tests/test_config.py b/tests/test_config.py index 45b0b938af215..4518adfc31bfc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -7,11 +7,17 @@ from vllm.platforms import current_platform -@pytest.mark.parametrize(("model_id", "expected_task"), [ - ("facebook/opt-125m", "generate"), - ("intfloat/e5-mistral-7b-instruct", "embedding"), -]) -def test_auto_task(model_id, expected_task): +@pytest.mark.parametrize( + ("model_id", "expected_runner_type", "expected_task"), + [ + ("facebook/opt-125m", "generate", "generate"), + ("intfloat/e5-mistral-7b-instruct", "pooling", "embed"), + ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"), + ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"), + ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"), + ], +) +def test_auto_task(model_id, expected_runner_type, expected_task): config = ModelConfig( model_id, task="auto", @@ -22,6 +28,7 @@ def test_auto_task(model_id, expected_task): dtype="float16", ) + assert config.runner_type == expected_runner_type assert config.task == expected_task diff --git a/vllm/config.py b/vllm/config.py index 2a9f0ebae997d..2d9a76fe7ddb1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -45,13 +45,27 @@ logger = init_logger(__name__) -_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 +_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 -TaskOption = Literal["auto", "generate", "embedding"] +TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", + "score", "reward"] -# "draft" is only used internally for speculative decoding -_Task = Literal["generate", "embedding", "draft"] +_ResolvedTask = Literal["generate", "embed", "classify", "score", "reward", + "draft"] + +RunnerType = Literal["generate", "pooling", "draft"] + +_RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = { + "generate": ["generate"], + "pooling": ["embed", "classify", "score", "reward"], + "draft": ["draft"], +} + +_TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { + task: runner + for runner, tasks in _RUNNER_TASKS.items() for task in tasks +} HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] @@ -144,7 +158,7 @@ class ModelConfig: def __init__( self, model: str, - task: Union[TaskOption, _Task], + task: Union[TaskOption, Literal["draft"]], tokenizer: str, tokenizer_mode: str, trust_remote_code: bool, @@ -295,6 +309,7 @@ def __init__( supported_tasks, task = self._resolve_task(task, self.hf_config) self.supported_tasks = supported_tasks self.task: Final = task + self.pooler_config = self._init_pooler_config(override_pooler_config) self._verify_quantization() @@ -323,7 +338,7 @@ def _init_pooler_config( override_pooler_config: Optional["PoolerConfig"], ) -> Optional["PoolerConfig"]: - if self.task == "embedding": + if self.runner_type == "pooling": user_config = override_pooler_config or PoolerConfig() base_config = get_pooling_config(self.model, self.revision) @@ -357,60 +372,90 @@ def _verify_tokenizer_mode(self) -> None: "either 'auto', 'slow' or 'mistral'.") self.tokenizer_mode = tokenizer_mode + def _get_preferred_task( + self, + architectures: List[str], + supported_tasks: Set[_ResolvedTask], + ) -> Optional[_ResolvedTask]: + model_id = self.model + if get_pooling_config(model_id, self.revision): + return "embed" + 
if ModelRegistry.is_cross_encoder_model(architectures): + return "score" + + suffix_to_preferred_task: List[Tuple[str, _ResolvedTask]] = [ + # Other models follow this pattern + ("ForCausalLM", "generate"), + ("ForConditionalGeneration", "generate"), + ("ForSequenceClassification", "classify"), + ("ChatModel", "generate"), + ("LMHeadModel", "generate"), + ("EmbeddingModel", "embed"), + ("RewardModel", "reward"), + ] + _, arch = ModelRegistry.inspect_model_cls(architectures) + + for suffix, pref_task in suffix_to_preferred_task: + if arch.endswith(suffix) and pref_task in supported_tasks: + return pref_task + + return None + def _resolve_task( self, - task_option: Union[TaskOption, _Task], + task_option: Union[TaskOption, Literal["draft"]], hf_config: PretrainedConfig, - ) -> Tuple[Set[_Task], _Task]: + ) -> Tuple[Set[_ResolvedTask], _ResolvedTask]: if task_option == "draft": return {"draft"}, "draft" architectures = getattr(hf_config, "architectures", []) - task_support: Dict[_Task, bool] = { + runner_support: Dict[RunnerType, bool] = { # NOTE: Listed from highest to lowest priority, # in case the model supports multiple of them "generate": ModelRegistry.is_text_generation_model(architectures), - "embedding": ModelRegistry.is_pooling_model(architectures), + "pooling": ModelRegistry.is_pooling_model(architectures), } - supported_tasks_lst: List[_Task] = [ - task for task, is_supported in task_support.items() if is_supported + supported_runner_types_lst: List[RunnerType] = [ + runner_type + for runner_type, is_supported in runner_support.items() + if is_supported + ] + + supported_tasks_lst: List[_ResolvedTask] = [ + task for runner_type in supported_runner_types_lst + for task in _RUNNER_TASKS[runner_type] ] supported_tasks = set(supported_tasks_lst) if task_option == "auto": selected_task = next(iter(supported_tasks_lst)) - if len(supported_tasks) > 1: - suffix_to_preferred_task: List[Tuple[str, _Task]] = [ - # Hardcode the models that are exceptions - ("AquilaModel", "generate"), - ("ChatGLMModel", "generate"), - # Other models follow this pattern - ("ForCausalLM", "generate"), - ("ForConditionalGeneration", "generate"), - ("ChatModel", "generate"), - ("LMHeadModel", "generate"), - ("EmbeddingModel", "embedding"), - ("RewardModel", "embedding"), - ("ForSequenceClassification", "embedding"), - ] - info, arch = ModelRegistry.inspect_model_cls(architectures) - - for suffix, pref_task in suffix_to_preferred_task: - if arch.endswith(suffix) and pref_task in supported_tasks: - selected_task = pref_task - break - else: - if (arch.endswith("Model") - and info.architecture.endswith("ForCausalLM") - and "embedding" in supported_tasks): - selected_task = "embedding" + if len(supported_tasks_lst) > 1: + preferred_task = self._get_preferred_task( + architectures, supported_tasks) + if preferred_task is not None: + selected_task = preferred_task logger.info( "This model supports multiple tasks: %s. " "Defaulting to '%s'.", supported_tasks, selected_task) else: + # Aliases + if task_option == "embedding": + preferred_task = self._get_preferred_task( + architectures, supported_tasks) + if preferred_task != "embed": + msg = ("The 'embedding' task will be restricted to " + "embedding models in a future release. 
Please " + "pass `--task classify`, `--task score`, or " + "`--task reward` explicitly for other pooling " + "models.") + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + task_option = preferred_task or "embed" + if task_option not in supported_tasks: msg = ( f"This model does not support the '{task_option}' task. " @@ -533,7 +578,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, # Async postprocessor is not necessary with embedding mode # since there is no token generation - if self.task == "embedding": + if self.runner_type == "pooling": self.use_async_output_proc = False # Reminder: Please update docs/source/usage/compatibility_matrix.rst @@ -750,6 +795,14 @@ def is_cross_encoder(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) return ModelRegistry.is_cross_encoder_model(architectures) + @property + def supported_runner_types(self) -> Set[RunnerType]: + return {_TASK_RUNNER[task] for task in self.supported_tasks} + + @property + def runner_type(self) -> RunnerType: + return _TASK_RUNNER[self.task] + class CacheConfig: """Configuration for the KV cache. @@ -1096,7 +1149,7 @@ def _verify_args(self) -> None: class SchedulerConfig: """Scheduler configuration.""" - task: str = "generate" # The task to use the model for. + runner_type: str = "generate" # The runner type to launch for the model. # Maximum number of tokens to be processed in a single iteration. max_num_batched_tokens: int = field(default=None) # type: ignore @@ -1164,11 +1217,11 @@ def __post_init__(self) -> None: # for higher throughput. self.max_num_batched_tokens = max(self.max_model_len, 2048) - if self.task == "embedding": - # For embedding, choose specific value for higher throughput + if self.runner_type == "pooling": + # Choose specific value for higher throughput self.max_num_batched_tokens = max( self.max_num_batched_tokens, - _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS, + _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, ) if self.is_multimodal_model: # The value needs to be at least the number of multimodal tokens diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 94c62743883ec..c3bc6becf0995 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -337,7 +337,7 @@ def __init__( self.lora_config = lora_config version = "selfattn" - if (self.scheduler_config.task == "embedding" + if (self.scheduler_config.runner_type == "pooling" or self.cache_config.is_attention_free): version = "placeholder" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7b9adc401abcf..d485c2a9e7208 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1066,7 +1066,7 @@ def create_engine_config(self, if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and model_config.task != "embedding"): + and model_config.runner_type != "pooling"): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " @@ -1083,7 +1083,8 @@ def create_engine_config(self, "errors during the initial memory profiling phase, or result " "in low performance due to small KV cache space. 
Consider " "setting --max-model-len to a smaller value.", max_model_len) - elif self.enable_chunked_prefill and model_config.task == "embedding": + elif (self.enable_chunked_prefill + and model_config.runner_type == "pooling"): msg = "Chunked prefill is not supported for embedding models" raise ValueError(msg) @@ -1144,7 +1145,7 @@ def create_engine_config(self, " please file an issue with detailed information.") scheduler_config = SchedulerConfig( - task=model_config.task, + runner_type=model_config.runner_type, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6eca304b45f07..9be30c635cb2c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -288,7 +288,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.model_executor = executor_class(vllm_config=vllm_config, ) - if self.model_config.task != "embedding": + if self.model_config.runner_type != "pooling": self._initialize_kv_caches() # If usage stat is enabled, collect relevant info. @@ -1123,7 +1123,7 @@ def _process_model_outputs(self, seq_group.metrics.model_execute_time = ( o.model_execute_time) - if self.model_config.task == "embedding": + if self.model_config.runner_type == "pooling": self._process_sequence_group_outputs(seq_group, output) else: self.output_processor.process_prompt_logprob(seq_group, output) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2a02187223a33..0bec978c4869c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -381,19 +381,20 @@ def generate( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - task = self.llm_engine.model_config.task - if task != "generate": + runner_type = self.llm_engine.model_config.runner_type + if runner_type != "generate": messages = [ "LLM.generate() is only supported for (conditional) generation " "models (XForCausalLM, XForConditionalGeneration).", ] - supported_tasks = self.llm_engine.model_config.supported_tasks - if "generate" in supported_tasks: + supported_runner_types = self.llm_engine.model_config \ + .supported_runner_types + if "generate" in supported_runner_types: messages.append( - "Your model supports the 'generate' task, but is " - f"currently initialized for the '{task}' task. Please " - "initialize the model using `--task generate`.") + "Your model supports the 'generate' runner, but is " + f"currently initialized for the '{runner_type}' runner. " + "Please initialize vLLM using `--task generate`.") raise ValueError(" ".join(messages)) @@ -793,16 +794,18 @@ def encode( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - task = self.llm_engine.model_config.task - if task != "embedding": - messages = ["LLM.encode() is only supported for embedding models."] + runner_type = self.llm_engine.model_config.runner_type + if runner_type != "pooling": + messages = ["LLM.encode() is only supported for pooling models."] - supported_tasks = self.llm_engine.model_config.supported_tasks - if "embedding" in supported_tasks: + supported_runner_types = self.llm_engine.model_config \ + .supported_runner_types + if "pooling" in supported_runner_types: messages.append( - "Your model supports the 'embedding' task, but is " - f"currently initialized for the '{task}' task. 
Please " - "initialize the model using `--task embedding`.") + "Your model supports the 'pooling' runner, but is " + f"currently initialized for the '{runner_type}' runner. " + "Please initialize vLLM using `--task embed`, " + "`--task classify`, `--task score` etc.") raise ValueError(" ".join(messages)) @@ -864,21 +867,23 @@ def score( A list of ``PoolingRequestOutput`` objects containing the generated scores in the same order as the input prompts. """ - task = self.llm_engine.model_config.task - if task != "embedding": - messages = ["LLM.score() is only supported for embedding models."] + runner_type = self.llm_engine.model_config.runner_type + if runner_type != "pooling": + messages = ["LLM.score() is only supported for pooling models."] - supported_tasks = self.llm_engine.model_config.supported_tasks - if "embedding" in supported_tasks: + supported_runner_types = self.llm_engine.model_config \ + .supported_runner_types + if "pooling" in supported_runner_types: messages.append( - "Your model supports the 'embedding' task, but is " - f"currently initialized for the '{task}' task. Please " - "initialize the model using `--task embedding`.") + "Your model supports the 'pooling' runner, but is " + f"currently initialized for the '{runner_type}' runner. " + "Please initialize vLLM using `--task embed`, " + "`--task classify`, `--task score` etc.") raise ValueError(" ".join(messages)) if not self.llm_engine.model_config.is_cross_encoder: - raise ValueError("Your model does not support the cross encoding") + raise ValueError("Your model does not support cross encoding") tokenizer = self.llm_engine.get_tokenizer() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0f93eb54111ad..a345f8caeeed2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -573,7 +573,7 @@ def init_app_state( enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, - ) if model_config.task == "generate" else None + ) if model_config.runner_type == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, @@ -582,7 +582,7 @@ def init_app_state( prompt_adapters=args.prompt_adapters, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) if model_config.task == "generate" else None + ) if model_config.runner_type == "generate" else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, @@ -590,13 +590,13 @@ def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if model_config.task == "embedding" else None + ) if model_config.runner_type == "pooling" else None state.openai_serving_scores = OpenAIServingScores( engine_client, model_config, base_model_paths, request_logger=request_logger - ) if (model_config.task == "embedding" \ + ) if (model_config.runner_type == "pooling" \ and model_config.is_cross_encoder) else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 00cdb3b6839f5..675daf54c0d0d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -224,7 +224,7 @@ async def main(args): chat_template=None, chat_template_content_format="auto", 
enable_prompt_tokens_details=args.enable_prompt_tokens_details, - ) if model_config.task == "generate" else None + ) if model_config.runner_type == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( engine, model_config, @@ -232,7 +232,7 @@ async def main(args): request_logger=request_logger, chat_template=None, chat_template_content_format="auto", - ) if model_config.task == "embedding" else None + ) if model_config.runner_type == "pooling" else None tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index cfb89e0f336bc..f15e7176b3d50 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -35,7 +35,7 @@ def get_model_architecture( architectures = ["QuantMixtralForCausalLM"] model_cls, arch = ModelRegistry.resolve_model_cls(architectures) - if model_config.task == "embedding": + if model_config.runner_type == "pooling": model_cls = as_embedding_model(model_cls) return model_cls, arch diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index fdb241e6753fb..55a5c4dff3a5c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -42,7 +42,7 @@ def __init__( executor_class: Type[Executor], usage_context: UsageContext, ): - assert vllm_config.model_config.task != "embedding" + assert vllm_config.model_config.runner_type != "pooling" logger.info("Initializing an LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 4fad1a3f4caeb..ba3d4a130a80b 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -163,7 +163,7 @@ def __init__( not in ["medusa", "mlp_speculator", "eagle"]) \ else {"return_hidden_states": True} ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner - if self.model_config.task == "embedding": + if self.model_config.runner_type == "pooling": ModelRunnerClass = CPUPoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 094dd5a5d08b3..832b9903b7abc 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -75,7 +75,7 @@ def __init__( else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner - if model_config.task == "embedding": + if model_config.runner_type == "pooling": ModelRunnerClass = PoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner From cad5c0a6eda057eeece87a42fff49fef3e18a2ac Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Dec 2024 21:36:27 +0800 Subject: [PATCH 019/357] [Doc] Update docs to refer to pooling models (#11093) Signed-off-by: DarkLight1337 --- docs/source/usage/faq.rst | 7 ++++++- vllm/attention/backends/placeholder_attn.py | 2 +- vllm/config.py | 8 ++++---- vllm/core/placeholder_block_space_manager.py | 2 +- vllm/engine/arg_utils.py | 4 ++-- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/multiprocessing/client.py | 2 +- vllm/engine/protocol.py | 2 +- vllm/entrypoints/openai/serving_score.py | 2 +- vllm/sequence.py | 6 +++--- vllm/v1/engine/processor.py | 2 +- vllm/worker/cpu_worker.py | 2 +- vllm/worker/hpu_worker.py | 4 ++-- vllm/worker/worker.py | 2 +- 14 files changed, 26 insertions(+), 21 deletions(-) diff --git a/docs/source/usage/faq.rst b/docs/source/usage/faq.rst index ce327abd5fa20..d88da32092924 
100644 --- a/docs/source/usage/faq.rst +++ b/docs/source/usage/faq.rst @@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul Q: Which model to use for offline inference embedding? -A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model +A: You can try `e5-mistral-7b-instruct `__ and `BAAI/bge-base-en-v1.5 `__; +more are listed :ref:`here `. + +By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B `__, +`Mistral-7B-Instruct-v0.3 `__ into embedding models, +but they are expected to be inferior to models that are specifically trained on embedding tasks. ---------------------------------------- diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 658039bfc3365..534f79b3a60bf 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -14,7 +14,7 @@ from vllm.worker.model_runner import (ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata) -# Placeholder attention backend for models like Mamba and embedding models that +# Placeholder attention backend for models like Mamba and pooling models that # lack attention. diff --git a/vllm/config.py b/vllm/config.py index 2d9a76fe7ddb1..322c8f8990a40 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -152,7 +152,7 @@ class ModelConfig: this argument will be used to configure the neuron config that can not be gathered from the vllm arguments. override_pooler_config: Initialize non default pooling config or - override default pooling config for the embedding model. + override default pooling config for the pooling model. """ def __init__( @@ -576,7 +576,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Async postprocessor is not necessary with embedding mode + # Async postprocessor is not necessary for pooling models # since there is no token generation if self.runner_type == "pooling": self.use_async_output_proc = False @@ -1825,11 +1825,11 @@ class MultiModalConfig: @dataclass class PoolerConfig: - """Controls the behavior of output pooling in embedding models.""" + """Controls the behavior of output pooling in pooling models.""" pooling_type: Optional[str] = None """ - The pooling method of the embedding model. This should be a key in + The pooling method of the pooling model. This should be a key in :class:`vllm.model_executor.layers.pooler.PoolingType`. """ diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index 26d42b7f1790e..a47e594518534 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -8,7 +8,7 @@ class PlaceholderBlockSpaceManager(BlockSpaceManager): """A version of BlockSpaceManager for use in environments where block management is not required. - For example: embedding models or attention-free models like Mamba. + For example: pooling models or attention-free models like Mamba.
This class provides the same interface as BlockSpaceManager, but its methods perform no actions or return simple values like True in specific diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d485c2a9e7208..7337522bc9952 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -893,7 +893,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--override-pooler-config', type=PoolerConfig.from_json, default=None, - help="Override or set the pooling method in the embedding model. " + help="Override or set the pooling method for pooling models. " "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'") parser.add_argument('--compilation-config', @@ -1085,7 +1085,7 @@ def create_engine_config(self, "setting --max-model-len to a smaller value.", max_model_len) elif (self.enable_chunked_prefill and model_config.runner_type == "pooling"): - msg = "Chunked prefill is not supported for embedding models" + msg = "Chunked prefill is not supported for pooling models" raise ValueError(msg) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 60dccd7a0812c..32396fd10188d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1085,7 +1085,7 @@ async def encode( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from an embedding model. + """Generate outputs for a request from a pooling model. Generate outputs for a request. This method is a coroutine. It adds the request into the waiting queue of the LLMEngine and streams the outputs diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index a729023bc00bb..0a046c71e86e8 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -527,7 +527,7 @@ def encode( *, inputs: Optional[PromptType] = None # DEPRECATED ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from an embedding model. + """Generate outputs for a request from a pooling model. Generate outputs for a request. This method is a coroutine. It adds the request into the waiting queue of the LLMEngine and streams the outputs diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 4079de7d36793..a066836b92708 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -209,7 +209,7 @@ def encode( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from an embedding model.""" + """Generate outputs for a request from a pooling model.""" ... @abstractmethod diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index fed06fa452955..4929e720c00e4 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -119,7 +119,7 @@ async def create_score( if prompt_adapter_request is not None: raise NotImplementedError("Prompt adapter is not supported " - "for embedding models") + "for scoring models") if isinstance(tokenizer, MistralTokenizer): raise ValueError( diff --git a/vllm/sequence.py b/vllm/sequence.py index 669124319c4f4..b0f3c1cc3609f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -618,9 +618,9 @@ class SequenceGroup: arrival_time: The arrival time of the request. lora_request: LoRA request. 
embeddings: The embeddings vectors of the prompt of the sequence group - for an embedding model. + for a pooling model. pooling_params: The pooling parameters used to generate the pooling - for an embedding model. + for a pooling model. encoder_seq: Optional, the single encoder sequence. Should be None unless you are working with an encoder/decoder model. trace_headers: OpenTelemetry trace headers. @@ -1102,7 +1102,7 @@ class PoolerOutput( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] - """The output from a pooling operation in the embedding model.""" + """The output from a pooling operation in the pooling model.""" outputs: List[EmbeddingSequenceGroupOutput] # lazy import to avoid circular import diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 120fc64969552..e0e525b30a767 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -59,7 +59,7 @@ def process_inputs( priority: int = 0, ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: - # TODO(woosuk): Support embedding mode. + # TODO(woosuk): Support pooling models. # TODO(woosuk): Check max_logprobs # TODO(woosuk): Support encoder-decoder models. diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index ba3d4a130a80b..09758a5d9accf 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -178,7 +178,7 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CPUCacheEngine] - # Initialize cpu_cache as embedding models don't initialize kv_caches + # Initialize cpu_cache as pooling models don't initialize kv_caches self.cpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. Enabled and configured through env vars: diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 493f7a9fad098..cca7cd50bfc7b 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -65,8 +65,8 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[HPUCacheEngine] - # Initialize gpu_cache as embedding models don't initialize kv_caches - self.hpu_cache: Optional[List[List[torch.tensor]]] = None + # Initialize gpu_cache as pooling models don't initialize kv_caches + self.hpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 832b9903b7abc..a368bb9ee9a5b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -91,7 +91,7 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: List[CacheEngine] - # Initialize gpu_cache as embedding models don't initialize kv_caches + # Initialize gpu_cache as pooling models don't initialize kv_caches self.gpu_cache: Optional[List[List[torch.Tensor]]] = None self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} From b2f775456e4af7412308320a9c11e4dac3086205 Mon Sep 17 00:00:00 2001 From: hissu-hyvarinen Date: Wed, 11 Dec 2024 17:23:37 +0200 Subject: [PATCH 020/357] [CI/Build] Enable prefix caching test for AMD (#11098) Signed-off-by: Hissu Hyvarinen --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8f57006214c88..df4fa7a6ee9ba 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -201,7 +201,7 @@ steps: - python3 offline_profile.py --model facebook/opt-125m - label: Prefix Caching Test # 9min - #mirror_hardwares: [amd] + mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/prefix_caching From fd22220687af5ccd89d9f8f2812069ef0422244c Mon Sep 17 00:00:00 2001 From: bingps <46775742+bingps@users.noreply.github.com> Date: Wed, 11 Dec 2024 23:43:24 +0800 Subject: [PATCH 021/357] [Doc] Installed version of llmcompressor for int8/fp8 quantization (#11103) Signed-off-by: Guangda Liu Co-authored-by: Guangda Liu --- docs/source/quantization/fp8.rst | 2 +- docs/source/quantization/int8.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst index aacd07a34ad46..4dbf8e9d346e1 100644 --- a/docs/source/quantization/fp8.rst +++ b/docs/source/quantization/fp8.rst @@ -45,7 +45,7 @@ To produce performant FP8 quantized models with vLLM, you'll need to install the .. code-block:: console - $ pip install llmcompressor==0.1.0 + $ pip install llmcompressor Quantization Process -------------------- diff --git a/docs/source/quantization/int8.rst b/docs/source/quantization/int8.rst index 04fa308449507..aa5b251becb1c 100644 --- a/docs/source/quantization/int8.rst +++ b/docs/source/quantization/int8.rst @@ -19,7 +19,7 @@ To use INT8 quantization with vLLM, you'll need to install the `llm-compressor < .. code-block:: console - $ pip install llmcompressor==0.1.0 + $ pip install llmcompressor Quantization Process -------------------- @@ -142,4 +142,4 @@ Best Practices Troubleshooting and Support --------------------------- -If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. \ No newline at end of file +If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. 
From 91642db952458fbb6ae7c2d167757dc86b105991 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 11 Dec 2024 10:43:05 -0800 Subject: [PATCH 022/357] [torch.compile] use depyf to dump torch.compile internals (#10972) Signed-off-by: youkaichao --- requirements-common.txt | 1 + vllm/compilation/backends.py | 69 ++++++++++++++++++---------------- vllm/compilation/decorators.py | 2 +- vllm/compilation/monitor.py | 23 ++++++++++-- vllm/compilation/wrapper.py | 4 +- vllm/config.py | 6 ++- vllm/worker/model_runner.py | 3 +- 7 files changed, 66 insertions(+), 42 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 792cd58e80669..850b8f4101701 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -33,3 +33,4 @@ six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that need setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. compressed-tensors == 0.8.0 # required for compressed-tensors +depyf==0.18.0 # required for profiling and debugging torch.compile diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index f002a8ff905b1..09a3daa731829 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -9,7 +9,7 @@ import torch.fx as fx import vllm.envs as envs -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, VllmConfig from vllm.logger import init_logger from vllm.utils import weak_ref_tensors @@ -149,14 +149,15 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): """ def __init__(self, module: torch.fx.GraphModule, - compile_submod_names: List[str], - compilation_configs: CompilationConfig, graph_pool): + compile_submod_names: List[str], vllm_config: VllmConfig, + graph_pool): super().__init__(module) from torch._guards import detect_fake_mode self.fake_mode = detect_fake_mode() self.compile_submod_names = compile_submod_names - self.compilation_configs = compilation_configs + self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool + self.vllm_config = vllm_config def run(self, *args): fake_args = [ @@ -182,15 +183,15 @@ def call_module(self, target: torch.fx.node.Target, compiled_graph_for_general_shape = wrap_inductor( submod, args, - self.compilation_configs.inductor_compile_config, - self.compilation_configs, + self.compilation_config.inductor_compile_config, + self.compilation_config, graph_index=index, num_graphs=len(self.compile_submod_names), runtime_shape=None, - use_inductor=self.compilation_configs.use_inductor) + use_inductor=self.compilation_config.use_inductor) self.module.__dict__[target] = PiecewiseBackend( - submod, self.compilation_configs, self.graph_pool, index, + submod, self.vllm_config, self.graph_pool, index, len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_general_shape) @@ -211,7 +212,8 @@ class VllmBackend: which handles the post-grad passes. 
""" - compilation_configs: CompilationConfig + vllm_config: VllmConfig + compilation_config: CompilationConfig graph_pool: Any _called: bool = False # the graph we compiled @@ -227,7 +229,7 @@ class VllmBackend: def __init__( self, - compilation_configs: CompilationConfig, + vllm_config: VllmConfig, ): global global_graph_pool if global_graph_pool is None: @@ -244,13 +246,14 @@ def __init__( self.sym_tensor_indices = [] self.input_buffers = [] - self.compilation_configs = compilation_configs + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config # `torch.compile` is JIT compiled, so we don't need to # do anything here def configure_post_pass(self): - config = self.compilation_configs + config = self.compilation_config self.post_grad_pass_manager.configure(config.pass_config) # Post-grad custom passes are run using the post_grad_custom_post_pass @@ -271,7 +274,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: from .monitor import torch_compile_start_time dynamo_time = time.time() - torch_compile_start_time logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time) - self.compilation_configs.compilation_time += dynamo_time + self.compilation_config.compilation_time += dynamo_time # we control the compilation process, each instance can only be # called once @@ -281,7 +284,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self.configure_post_pass() self.split_gm, self.piecewise_graphs = split_graph( - graph, self.compilation_configs.splitting_ops) + graph, self.compilation_config.splitting_ops) from torch._dynamo.utils import lazy_format_graph_code logger.debug("%s", lazy_format_graph_code("before split", self.graph)) @@ -298,13 +301,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # propagate the split graph to the piecewise backend, # compile submodules with symbolic shapes PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, - self.compilation_configs, + self.vllm_config, self.graph_pool).run(*example_inputs) self._called = True - if not self.compilation_configs.use_cudagraph or \ - not self.compilation_configs.cudagraph_copy_inputs: + if not self.compilation_config.use_cudagraph or \ + not self.compilation_config.cudagraph_copy_inputs: return self.split_gm # if we need to copy input buffers for cudagraph @@ -364,10 +367,9 @@ class ConcreteSizeEntry: class PiecewiseBackend: - def __init__(self, graph: fx.GraphModule, - compilation_configs: CompilationConfig, graph_pool: Any, - piecewise_compile_index: int, total_piecewise_compiles: int, - sym_shape_indices: List[int], + def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, + graph_pool: Any, piecewise_compile_index: int, + total_piecewise_compiles: int, sym_shape_indices: List[int], compiled_graph_for_general_shape: Callable): """ The backend for piecewise compilation. @@ -375,7 +377,7 @@ def __init__(self, graph: fx.GraphModule, We will compile `self.graph` once for the general shape, and then compile for different shapes specified in - `compilation_configs.compile_sizes`. + `compilation_config.compile_sizes`. Independently, we will capture cudagraph for different shapes. @@ -383,7 +385,8 @@ def __init__(self, graph: fx.GraphModule, compile it first, and then capture cudagraph. 
""" self.graph = graph - self.compilation_configs = compilation_configs + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool self.piecewise_compile_index = piecewise_compile_index self.total_piecewise_compiles = total_piecewise_compiles @@ -393,10 +396,10 @@ def __init__(self, graph: fx.GraphModule, piecewise_compile_index == total_piecewise_compiles - 1) self.compile_sizes: Set[int] = set( - self.compilation_configs.compile_sizes) + self.compilation_config.compile_sizes) self.capture_sizes: Set[int] = set( - self.compilation_configs.capture_sizes - ) if self.compilation_configs.use_cudagraph else set() + self.compilation_config.capture_sizes + ) if self.compilation_config.use_cudagraph else set() self.first_run_finished = False @@ -423,7 +426,7 @@ def __call__(self, *args) -> Any: self.first_run_finished = True # no specific sizes to compile if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.compilation_configs) + end_monitoring_torch_compile(self.vllm_config) return self.compiled_graph_for_general_shape(*args) runtime_shape = args[self.sym_shape_indices[0]] @@ -443,28 +446,28 @@ def __call__(self, *args) -> Any: entry.runnable = wrap_inductor( self.graph, args, - self.compilation_configs.inductor_compile_config, - self.compilation_configs, + self.compilation_config.inductor_compile_config, + self.compilation_config, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, runtime_shape=runtime_shape, - use_inductor=self.compilation_configs.use_inductor) + use_inductor=self.compilation_config.use_inductor) # finished compilations for all required shapes if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.compilation_configs) + end_monitoring_torch_compile(self.vllm_config) if not entry.use_cudagraph: return entry.runnable(*args) if entry.cudagraph is None: - if entry.num_finished_warmup < self.compilation_configs.cudagraph_num_of_warmups: # noqa + if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups: # noqa entry.num_finished_warmup += 1 if self.is_first_graph: logger.debug( "Warming up %s/%s for shape %s", entry.num_finished_warmup, - self.compilation_configs.cudagraph_num_of_warmups, + self.compilation_config.cudagraph_num_of_warmups, runtime_shape) return entry.runnable(*args) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 938430fe2a501..805a217ee6ca1 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -185,7 +185,7 @@ def __call__(self, *args, **kwargs): "Unsupported dynamic dimensions" f" {dims} for argument {k} with type {type(arg)}.") # here, it is the starting point of the `torch.compile` process - start_monitoring_torch_compile(self.vllm_config.compilation_config) + start_monitoring_torch_compile(self.vllm_config) # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 3348674b09af2..b97e40415b41b 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,19 +1,36 @@ +import os import time -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) +context_manager = None torch_compile_start_time: float = 
0.0 -def start_monitoring_torch_compile(compilation_config: CompilationConfig): +def start_monitoring_torch_compile(vllm_config: VllmConfig): global torch_compile_start_time torch_compile_start_time = time.time() + compilation_config: CompilationConfig = vllm_config.compilation_config + if compilation_config.level == CompilationLevel.PIECEWISE and \ + compilation_config.debug_dump_path: + import depyf + path = os.path.join(compilation_config.debug_dump_path, + f"rank_{vllm_config.parallel_config.rank}") + global context_manager + context_manager = depyf.prepare_debug(path) + context_manager.__enter__() -def end_monitoring_torch_compile(compilation_config: CompilationConfig): + +def end_monitoring_torch_compile(vllm_config: VllmConfig): + compilation_config: CompilationConfig = vllm_config.compilation_config if compilation_config.level == CompilationLevel.PIECEWISE: logger.info("torch.compile takes %.2f s in total", compilation_config.compilation_time) + global context_manager + if context_manager is not None: + context_manager.__exit__(None, None, None) + context_manager = None diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index bc4d292fef402..c10241b483169 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -32,8 +32,8 @@ def __init__(self, # default compilation settings # compiling the forward method - backend = get_current_vllm_config( - ).compilation_config.init_backend() + vllm_config = get_current_vllm_config() + backend = vllm_config.compilation_config.init_backend(vllm_config) compiled_callable = torch.compile( self.forward, diff --git a/vllm/config.py b/vllm/config.py index 322c8f8990a40..7f9be5a3a98bc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2222,6 +2222,7 @@ class CompilationConfig(BaseModel): - 1: dynamo as is. - 2: dynamo once. - 3: piecewise compilation. + - debug_dump_path: the path to dump the debug information. - backend: the backend for compilation. It needs to be a string. - "" (empty string): use the default backend. - "eager"/"openxla"/...: use the specified backend registered in PyTorch. @@ -2289,6 +2290,7 @@ class CompilationConfig(BaseModel): certain small batchsizes, where inductor is good at optimizing. 
""" # noqa level: int = 0 + debug_dump_path: str = "" backend: str = "" custom_ops: List[str] = Field(default_factory=list) splitting_ops: List[str] = Field(default_factory=lambda: [ @@ -2394,7 +2396,7 @@ def model_post_init(self, __context: Any) -> None: self.static_forward_context = {} self.compilation_time = 0.0 - def init_backend(self) -> Union[str, Callable]: + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -2413,7 +2415,7 @@ def init_backend(self) -> Union[str, Callable]: # merge with the config use_inductor assert self.level == CompilationLevel.PIECEWISE from vllm.compilation.backends import VllmBackend - return VllmBackend(self) + return VllmBackend(vllm_config) def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]): """To complete the initialization of config, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 551b84435fdc0..26fd486130ce6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1162,7 +1162,8 @@ def load_model(self) -> None: if self.vllm_config.compilation_config.level ==\ CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): - backend = self.vllm_config.compilation_config.init_backend() + backend = self.vllm_config.compilation_config.init_backend( + self.vllm_config) self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, From d643c2aba1cd5421200f3a3bad1813dd067233b4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 11 Dec 2024 10:49:23 -0800 Subject: [PATCH 023/357] [V1] Use input_ids as input for text-only models (#11032) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8d9976ded7c5e..e75be21ef2d91 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -61,6 +61,7 @@ def __init__( self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] + self.is_multimodal_model = model_config.is_multimodal_model self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len @@ -103,6 +104,11 @@ def __init__( # The batch sizes in the config are in descending order. self.cudagraph_batch_sizes = list( reversed(self.vllm_config.compilation_config.capture_sizes)) + + # Persistent buffers for CUDA graphs. + self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=self.device) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) @@ -310,7 +316,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - input_ids = input_ids.to(self.device, non_blocking=True) + self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, + non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) query_start_loc = query_start_loc.to(self.device, non_blocking=True) @@ -331,7 +338,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. 
logits_indices = query_start_loc[1:] - 1 - return input_ids, attn_metadata, logits_indices + return attn_metadata, logits_indices def _prepare_sampling( self, @@ -427,13 +434,15 @@ def execute_model( ) -> ModelRunnerOutput: self._update_states(scheduler_output) - # Run the encoder. - self._execute_encoder(scheduler_output) - encoder_outputs = self._gather_encoder_outputs(scheduler_output) + if self.is_multimodal_model: + # Run the multimodal encoder if any. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + else: + encoder_outputs = [] # Prepare the decoder inputs. - input_ids, attn_metadata, logits_indices = self._prepare_inputs( - scheduler_output) + attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -444,29 +453,39 @@ def execute_model( else: # Eager mode. num_input_tokens = num_scheduled_tokens - attn_metadata.num_input_tokens = num_input_tokens - # Get the inputs embeds. - if encoder_outputs: - inputs_embeds = self.model.get_input_embeddings( - input_ids, encoder_outputs) + if self.is_multimodal_model: + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + input_ids = self.input_ids[:num_scheduled_tokens] + if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + inputs_embeds = self.inputs_embeds[:num_input_tokens] + input_ids = None else: - inputs_embeds = self.model.get_input_embeddings(input_ids) - # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings), - # always use embeddings (rather than token ids) as input to the model. - # TODO(woosuk): Avoid the copy. Optimize. - self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + # For text-only models, we use token ids as input. + # While it is possible to use embeddings as input just like the + # multimodal models, it is not desirable for performance since + # then the embedding layer is not included in the CUDA graph. + input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None # Run the decoder. # Use persistent buffers for CUDA graphs. 
with set_forward_context(attn_metadata, self.vllm_config): hidden_states = self.model( - input_ids=None, + input_ids=input_ids, positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, attn_metadata=None, - inputs_embeds=self.inputs_embeds[:num_input_tokens], + inputs_embeds=inputs_embeds, ) hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] @@ -534,13 +553,20 @@ def _dummy_run( num_tokens: int, kv_caches: List[torch.Tensor], ) -> torch.Tensor: + if self.is_multimodal_model: + input_ids = None + inputs_embeds = self.inputs_embeds[:num_tokens] + else: + input_ids = self.input_ids[:num_tokens] + inputs_embeds = None with set_forward_context(None, self.vllm_config): hidden_states = model( - input_ids=None, + input_ids=input_ids, positions=self.positions[:num_tokens], kv_caches=kv_caches, attn_metadata=None, - inputs_embeds=self.inputs_embeds[:num_tokens]) + inputs_embeds=inputs_embeds, + ) return hidden_states def profile_run(self) -> None: From 66aaa7722df3d7ef9e9bd2942cab5cd0d7473174 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 11 Dec 2024 10:59:50 -0800 Subject: [PATCH 024/357] [torch.compile] remove graph logging in ci (#11110) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 09a3daa731829..4a5dc337d01b8 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -287,9 +287,11 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: graph, self.compilation_config.splitting_ops) from torch._dynamo.utils import lazy_format_graph_code - logger.debug("%s", lazy_format_graph_code("before split", self.graph)) - logger.debug("%s", lazy_format_graph_code("after split", - self.split_gm)) + + # depyf will hook lazy_format_graph_code and dump the graph + # for debugging, no need to print the graph here + lazy_format_graph_code("before split", self.graph) + lazy_format_graph_code("after split", self.split_gm) compilation_counter.num_piecewise_graphs_seen += len( self.piecewise_graphs) From 72ff3a968682e6a3f7620ab59f2baf5e8eb2777b Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:36:35 -0800 Subject: [PATCH 025/357] [core] Bump ray to use _overlap_gpu_communication in compiled graph tests (#10410) Signed-off-by: Rui Qiao Signed-off-by: Rui Qiao Co-authored-by: Rui Qiao --- requirements-test.in | 2 +- requirements-test.txt | 2 +- vllm/envs.py | 8 ++++++++ vllm/executor/ray_gpu_executor.py | 17 ++++++++++------- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/requirements-test.in b/requirements-test.in index c0b228148ab31..57fddb416317e 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -13,7 +13,7 @@ einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests peft -ray[adag]==2.35 +ray[adag]==2.40.0 sentence-transformers # required for embedding tests soundfile # required for audio tests timm # required for internvl test diff --git a/requirements-test.txt b/requirements-test.txt index 8ceb705cdffd7..c786a1249bddb 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -410,7 +410,7 @@ pyyaml==6.0.2 # ray # timm # transformers -ray[adag]==2.35.0 +ray[adag]==2.40.0 # via -r requirements-test.in redis==5.2.0 # via tensorizer diff --git a/vllm/envs.py b/vllm/envs.py index be5d9985b63a4..bc8c1499e9534 100644 --- a/vllm/envs.py 
+++ b/vllm/envs.py @@ -45,6 +45,7 @@ VLLM_USE_RAY_SPMD_WORKER: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True + VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = True VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 @@ -337,6 +338,13 @@ def get_default_config_root(): lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1")) ), + # If the env var is set, it enables GPU communication overlap in + # Ray's compiled DAG. This flag is ignored if + # VLLM_USE_RAY_COMPILED_DAG is not set. + "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": + lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "1")) + ), + # Use dedicated multiprocess context for workers. # Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 4263fb27265f6..4bf5cbbd18ffe 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -414,12 +414,10 @@ def _check_ray_adag_installation(self): import pkg_resources from packaging import version - required_version = version.parse("2.35") + required_version = version.parse("2.40") current_version = version.parse( pkg_resources.get_distribution("ray").version) - # TODO: update the constraint once we adapt to the backward - # incompatible API change from ray 2.36 - if current_version != required_version: + if current_version < required_version: raise ValueError(f"Ray version {required_version} is " f"required, but found {current_version}") @@ -445,6 +443,8 @@ def _compiled_ray_dag(self, enable_asyncio: bool): logger.info("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = %s", envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL) + logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s", + envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) with InputNode() as input_data: # Example DAG: PP=2, TP=4 # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501 @@ -480,7 +480,10 @@ def _compiled_ray_dag(self, enable_asyncio: bool): forward_dag = MultiOutputNode(outputs) - return forward_dag.experimental_compile(enable_asyncio=enable_asyncio) + return forward_dag.experimental_compile( + enable_asyncio=enable_asyncio, + _overlap_gpu_communication=envs. 
+ VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) def __del__(self): self.shutdown() @@ -507,8 +510,8 @@ async def execute_model_async( serialized_data = self.input_encoder.encode(execute_model_req) dag_future = await self.forward_dag.execute_async(serialized_data) - outputs = await dag_future - return self.output_decoder.decode(outputs[0]) + output = await dag_future[0] + return self.output_decoder.decode(output) async def _driver_execute_model_async( self, From d1e21a979bba4712f48dac1bbf410e0b57c92e7a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 12 Dec 2024 06:18:16 +0800 Subject: [PATCH 026/357] [CI/Build] Split up VLM tests (#11083) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 32 ++++++--- pyproject.toml | 3 +- .../vision_language/test_models.py | 72 ++++++++++++------- tests/utils.py | 37 ++++++---- 4 files changed, 94 insertions(+), 50 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index df4fa7a6ee9ba..aca505178df06 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -321,7 +321,7 @@ steps: ##### models test ##### -- label: Basic Models Test # 30min +- label: Basic Models Test # 24min source_file_dependencies: - vllm/ - tests/models @@ -331,7 +331,7 @@ steps: - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py -- label: Language Models Test (Standard) # 42min +- label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -342,7 +342,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model -- label: Language Models Test (Extended) # 50min +- label: Language Models Test (Extended) # 1h10min optional: true source_file_dependencies: - vllm/ @@ -353,7 +353,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 26min +- label: Multi-Modal Models Test (Standard) # 28min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -369,7 +369,7 @@ steps: - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Multi-Modal Models Test (Extended) # 1h15m +- label: Multi-Modal Models Test (Extended) 1 # 1h16m optional: true source_file_dependencies: - vllm/ @@ -380,14 +380,24 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not 
core_model' +- label: Multi-Modal Models Test (Extended) 2 # 38m + optional: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/vision_language + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' + # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test optional: true @@ -446,11 +456,11 @@ steps: - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus - - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus + - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py @@ -540,7 +550,7 @@ steps: # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional diff --git a/pyproject.toml b/pyproject.toml index 253b706a774a7..c5a14ecf5aea9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,7 +96,8 @@ markers = [ "core_model: enable this model test in each PR instead of only nightly", "cpu_model: enable this model test in CPU tests", "quant_model: run this model test under Quantized category", - "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", + "split: run this test as part of a split", + "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", ] diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index ed8f34a677f84..3101d1d2ea831 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -1,7 +1,9 @@ """Common tests for testing .generate() functionality for single / multiple image, embedding, and video support for different VLMs in vLLM. 
""" +import math import os +from collections import defaultdict from pathlib import PosixPath from typing import Type @@ -10,11 +12,12 @@ from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform -from vllm.utils import cuda_device_count_stateless, identity +from vllm.utils import identity from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets, _VideoAssets) -from ....utils import fork_new_process_for_each_test, large_gpu_mark +from ....utils import (fork_new_process_for_each_test, large_gpu_mark, + multi_gpu_marks) from ...utils import check_outputs_equal from .vlm_utils import custom_inputs, model_utils, runners from .vlm_utils.case_filtering import get_parametrized_options @@ -382,7 +385,7 @@ prompt_path_encoder=model_utils.qwen_prompt_path_encoder, ), ### Tensor parallel / multi-gpu broadcast tests - "broadcast-chameleon": VLMTestInfo( + "chameleon-broadcast": VLMTestInfo( models=["facebook/chameleon-7b"], prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, @@ -393,43 +396,25 @@ vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], hf_output_post_proc = lambda hf_output, model: hf_output[:2], comparator=check_outputs_equal, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ), - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), - "broadcast-llava": VLMTestInfo( + "llava-broadcast": VLMTestInfo( models=["llava-hf/llava-1.5-7b-hf"], prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ) - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), - "broadcast-llava_next": VLMTestInfo( + "llava_next-broadcast": VLMTestInfo( models=["llava-hf/llava-v1.6-mistral-7b-hf"], prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", max_model_len=10240, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - marks=[ - pytest.mark.distributed_2_gpus, - pytest.mark.skipif( - cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.", - ) - ], + marks=multi_gpu_marks(num_gpus=2), **COMMON_BROADCAST_SETTINGS # type: ignore ), ### Custom input edge-cases for specific models @@ -468,6 +453,41 @@ # yapf: enable +def _mark_splits( + test_settings: dict[str, VLMTestInfo], + *, + num_groups: int, +) -> dict[str, VLMTestInfo]: + name_by_test_info_id = {id(v): k for k, v in test_settings.items()} + test_infos_by_model = defaultdict[str, list[VLMTestInfo]](list) + + for info in test_settings.values(): + for model in info.models: + test_infos_by_model[model].append(info) + + models = sorted(test_infos_by_model.keys()) + split_size = math.ceil(len(models) / num_groups) + + new_test_settings = dict[str, VLMTestInfo]() + + for i in range(num_groups): + models_in_group = models[i * split_size:(i + 1) * split_size] + + for model in models_in_group: + for info in test_infos_by_model[model]: + new_marks = (info.marks or []) + [pytest.mark.split(group=i)] + new_info = info._replace(marks=new_marks) + new_test_settings[name_by_test_info_id[id(info)]] = new_info + + missing_keys 
= test_settings.keys() - new_test_settings.keys() + assert not missing_keys, f"Missing keys: {missing_keys}" + + return new_test_settings + + +VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) + + ### Test wrappers # Wrappers around the core test running func for: # - single image diff --git a/tests/utils.py b/tests/utils.py index a893667e144a6..afeb708f3bcdc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -682,10 +682,12 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: - """Gets a pytest skipif mark, which triggers ig the the device doesn't have - meet a minimum memory requirement in gb; can be leveraged via - @large_gpu_test to skip tests in environments without enough resources, or - called when filtering tests to run directly. + """ + Get a pytest mark, which skips the test if the GPU doesn't meet + a minimum memory requirement in GB. + + This can be leveraged via `@large_gpu_test` to skip tests in environments + without enough resources, or called when filtering tests to run directly. """ try: if current_platform.is_cpu(): @@ -712,26 +714,37 @@ def large_gpu_test(*, min_gb: int): Currently, the CI machine uses L4 GPU which has 24 GB VRAM. """ - test_skipif = large_gpu_mark(min_gb) + mark = large_gpu_mark(min_gb) def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - return test_skipif(f) + return mark(f) return wrapper -def multi_gpu_test(*, num_gpus: int): - """ - Decorate a test to be run only when multiple GPUs are available. - """ - test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus") +def multi_gpu_marks(*, num_gpus: int): + """Get a collection of pytest marks to apply for `@multi_gpu_test`.""" + test_selector = pytest.mark.distributed(num_gpus=num_gpus) test_skipif = pytest.mark.skipif( cuda_device_count_stateless() < num_gpus, reason=f"Need at least {num_gpus} GPUs to run the test.", ) + return [test_selector, test_skipif] + + +def multi_gpu_test(*, num_gpus: int): + """ + Decorate a test to be run only when multiple GPUs are available. + """ + marks = multi_gpu_marks(num_gpus=num_gpus) + def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - return test_selector(test_skipif(fork_new_process_for_each_test(f))) + func = fork_new_process_for_each_test(f) + for mark in reversed(marks): + func = mark(func) + + return func return wrapper From 452a723bf2e8410ee9b47f82f90c7ea48aa6d14f Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 11 Dec 2024 18:34:54 -0500 Subject: [PATCH 027/357] [V1][Core] Remove should_shutdown to simplify core process termination (#11113) Signed-off-by: Tyler Michael Smith --- vllm/v1/engine/core.py | 13 ++----------- vllm/v1/engine/core_client.py | 6 ------ 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 55a5c4dff3a5c..a26ffe74a3ae8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -5,7 +5,6 @@ import threading import time from multiprocessing.process import BaseProcess -from multiprocessing.sharedctypes import Synchronized from typing import List, Tuple, Type, Union import zmq @@ -133,13 +132,9 @@ def __init__( input_path: str, output_path: str, ready_path: str, - should_shutdown: Synchronized, ): super().__init__(vllm_config, executor_class, usage_context) - # Signal from main process to shutdown (multiprocessing.Value). - self.should_shutdown = should_shutdown - # Background Threads and Queues for IO. 
These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, # and to overlap some serialization/deserialization with the @@ -195,7 +190,6 @@ def make_engine_core_process( input_path: str, output_path: str, ready_path: str, - should_shutdown: Synchronized, ) -> BaseProcess: # The current process might have CUDA context, # so we need to spawn a new process. @@ -210,7 +204,6 @@ def make_engine_core_process( "vllm_config": vllm_config, "executor_class": executor_class, "usage_context": usage_context, - "should_shutdown": should_shutdown } # Run EngineCore busy loop in background process. proc = context.Process(target=EngineCoreProc.run_engine_core, @@ -260,8 +253,8 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - # Loop until we get a shutdown signal. - while not self.should_shutdown: + # Loop until process is sent a SIGINT or SIGTERM + while True: # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: @@ -272,8 +265,6 @@ def run_busy_loop(self): except queue.Empty: self._log_stats() logger.debug("EngineCore busy loop waiting.") - if self.should_shutdown: - return except BaseException: raise diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4d96b323d1662..1d5ddf4db4d7c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,4 @@ import atexit -import multiprocessing from typing import List, Union import msgspec @@ -149,21 +148,16 @@ def __init__( self.input_socket.bind(input_path) # Start EngineCore in background process. - self.should_shutdown = multiprocessing.Value('b', False, lock=False) self.proc = EngineCoreProc.make_engine_core_process( *args, input_path=input_path, output_path=output_path, ready_path=ready_path, - should_shutdown=self.should_shutdown, **kwargs, ) atexit.register(self.shutdown) def shutdown(self): - # Send shutdown signal to background process. - self.should_shutdown = True - # Shut down the zmq context. self.ctx.destroy(linger=0) From 4e116833686f3e0c0a223b05b5859ad76843a017 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Wed, 11 Dec 2024 19:55:30 -0500 Subject: [PATCH 028/357] [V1] VLM preprocessor hashing (#11020) Signed-off-by: Roger Wang Signed-off-by: Alexander Matveev Co-authored-by: Michael Goin Co-authored-by: Roger Wang --- examples/offline_inference_vision_language.py | 126 ++++++++++++-- requirements-common.txt | 1 + tests/v1/engine/test_engine_core.py | 1 + tests/v1/engine/test_engine_core_client.py | 1 + vllm/config.py | 10 +- vllm/engine/arg_utils.py | 8 + vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/core.py | 18 +- vllm/v1/engine/mm_input_mapper.py | 156 ++++++++++++++++-- vllm/v1/engine/processor.py | 35 ++-- vllm/v1/utils.py | 21 +++ 11 files changed, 332 insertions(+), 48 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index c6a274ee5894b..5e210126dc8fe 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -5,6 +5,8 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
""" +import random + from transformers import AutoTokenizer from vllm import LLM, SamplingParams @@ -23,7 +25,9 @@ def run_llava(question: str, modality: str): prompt = f"USER: \n{question}\nASSISTANT:" - llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096) + llm = LLM(model="llava-hf/llava-1.5-7b-hf", + max_model_len=4096, + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids @@ -33,7 +37,9 @@ def run_llava_next(question: str, modality: str): assert modality == "image" prompt = f"[INST] \n{question} [/INST]" - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192) + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", + max_model_len=8192, + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids @@ -44,7 +50,9 @@ def run_llava_next_video(question: str, modality: str): assert modality == "video" prompt = f"USER: